llama_cpp 0.9.3 → 0.9.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 2491ee80a5e822375f140b7d465a6783be62ef9f98aa510495723bd2d80b3f81
- data.tar.gz: ad9ddbda1470602b976231edae030efd1ef0d854b41e0ce509e9b07ec78113e9
+ metadata.gz: 0fe656f26d7680d1b96c6949d40f4f615209c1c752b45ef145ac0f68b4af1d26
+ data.tar.gz: fb4d3c5b54a854edeeaf070b5497ba6656a5cff59b6b911b638551462004efb3
  SHA512:
- metadata.gz: 8a82ed440ae2bbe20f2c3818f22f88f1c5cab659060ad085a43ee657d1e60919acb74b9aac9b1d027fe84ddb30d170efc0e3799d33deddc59b4d34300332a798
- data.tar.gz: 164b4356580f0d2f17582fb84d59f0fbb9f816ac18921ea67d7cdda7f484620b605fdb88111ee32c1a42400c0770c520841304f7c2230ba577f4df1e5db453a0
+ metadata.gz: 6dc8bc34fcb2635e5fa99c31f134dca12af4c48a0c3f1effbbf209e6e3156f1f95bf133ed33c2eabc6e9f7988d668dcbdb0545a3807b38969680618ba8774848
+ data.tar.gz: 591d9ed44ed3b3a40424d3903659ad868afff727a2cfaffefd6222ba54f8a51fbfbab109ceea22a9a6bd3ca4661fb3947ca8f3f179ac2d0ad8cf8ba917b30ffe
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+ ## [[0.9.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.3...v0.9.4)] - 2023-11-25
+
+ - Bump bundled llama.cpp from b1523 to b1555.
+
  ## [[0.9.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.2...v0.9.3)] - 2023-11-18

  - Bump bundled llama.cpp from b1500 to b1523.
@@ -1,4 +1,5 @@
  #include <algorithm>
+ #include <cinttypes>
  #include <cstddef>
  #include <cstdint>
  #include <limits>
@@ -235,7 +236,7 @@ typedef float2 dfloat2;
  #endif //GGML_CUDA_F16

  static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
- const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+ const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment

  int x32 = 0;
  x32 |= x16[0] << 0;
@@ -245,7 +246,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const
  }

  static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
- const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+ const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment

  int x32 = 0;
  x32 |= x16[0] << 0;
@@ -255,11 +256,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con
  }

  static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
- return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+ return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  }

  static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
- return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+ return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  }

  template<typename T>
@@ -469,7 +470,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
  #define MUL_MAT_SRC1_COL_STRIDE 128

  #define MAX_STREAMS 8
- static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+ static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };

  struct ggml_tensor_extra_gpu {
  void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
@@ -2248,6 +2249,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
@@ -2259,7 +2261,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-
+ (void)x_qh; (void)x_sc;
  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
  GGML_CUDA_ASSUME(k >= 0);
@@ -2268,7 +2270,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI4_0;
  const int kqsx = k % QI4_0;

- const block_q4_0 * bx0 = (block_q4_0 *) vx;
+ const block_q4_0 * bx0 = (const block_q4_0 *) vx;

  float * x_dmf = (float *) x_dm;

@@ -2306,9 +2308,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
- const float * x_dmf = (float *) x_dm;
+ const float * x_dmf = (const float *) x_dm;

  int u[2*VDR_Q4_0_Q8_1_MMQ];

@@ -2342,6 +2345,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
@@ -2353,6 +2357,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2362,7 +2367,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI4_1;
  const int kqsx = k % QI4_1;

- const block_q4_1 * bx0 = (block_q4_1 *) vx;
+ const block_q4_1 * bx0 = (const block_q4_1 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2397,6 +2402,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));

@@ -2434,6 +2440,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
@@ -2445,6 +2452,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2454,7 +2462,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI5_0;
  const int kqsx = k % QI5_0;

- const block_q5_0 * bx0 = (block_q5_0 *) vx;
+ const block_q5_0 * bx0 = (const block_q5_0 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2509,6 +2517,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
@@ -2548,6 +2557,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
@@ -2559,6 +2569,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2568,7 +2579,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI5_1;
  const int kqsx = k % QI5_1;

- const block_q5_1 * bx0 = (block_q5_1 *) vx;
+ const block_q5_1 * bx0 = (const block_q5_1 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2620,6 +2631,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
@@ -2654,6 +2666,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
@@ -2665,6 +2678,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2675,7 +2689,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kqsx = k % QI8_0;
  float * x_dmf = (float *) x_dm;

- const block_q8_0 * bx0 = (block_q8_0 *) vx;
+ const block_q8_0 * bx0 = (const block_q8_0 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2710,6 +2724,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const float * x_dmf = (const float *) x_dm;
  const float * y_df = (const float *) y_ds;
@@ -2743,6 +2758,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
@@ -2756,6 +2772,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2765,7 +2782,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI2_K;
  const int kqsx = k % QI2_K;

- const block_q2_K * bx0 = (block_q2_K *) vx;
+ const block_q2_K * bx0 = (const block_q2_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2813,6 +2830,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const int kbx = k / QI2_K;
  const int ky = (k % QI2_K) * QR2_K;
@@ -2886,7 +2904,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI3_K;
  const int kqsx = k % QI3_K;

- const block_q3_K * bx0 = (block_q3_K *) vx;
+ const block_q3_K * bx0 = (const block_q3_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2967,7 +2985,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
  const float * x_dmf = (const float *) x_dm;
  const float * y_df = (const float *) y_ds;

- const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+ const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;

  int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];

@@ -3082,6 +3100,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
@@ -3095,6 +3114,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3104,7 +3124,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI4_K; // == 0 if QK_K == 256
  const int kqsx = k % QI4_K; // == k if QK_K == 256

- const block_q4_K * bx0 = (block_q4_K *) vx;
+ const block_q4_K * bx0 = (const block_q4_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3149,7 +3169,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

  const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);

- const int * scales = (int *) bxi->scales;
+ const int * scales = (const int *) bxi->scales;

  const int ksc = k % (WARP_SIZE/8);

@@ -3164,6 +3184,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);

@@ -3263,6 +3284,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
@@ -3276,6 +3298,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3285,7 +3308,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI5_K; // == 0 if QK_K == 256
  const int kqsx = k % QI5_K; // == k if QK_K == 256

- const block_q5_K * bx0 = (block_q5_K *) vx;
+ const block_q5_K * bx0 = (const block_q5_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3341,7 +3364,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

  const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);

- const int * scales = (int *) bxi->scales;
+ const int * scales = (const int *) bxi->scales;

  const int ksc = k % (WARP_SIZE/8);

@@ -3356,6 +3379,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);

@@ -3392,6 +3416,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
@@ -3405,6 +3430,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3414,7 +3440,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI6_K; // == 0 if QK_K == 256
  const int kqsx = k % QI6_K; // == k if QK_K == 256

- const block_q6_K * bx0 = (block_q6_K *) vx;
+ const block_q6_K * bx0 = (const block_q6_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3476,6 +3502,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const float * x_dmf = (const float *) x_dm;
  const float * y_df = (const float *) y_ds;
@@ -3518,7 +3545,7 @@ static __device__ __forceinline__ void mul_mat_q(
  __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
  __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];

- float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
+ float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};

  for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {

@@ -6023,18 +6050,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
  const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
  if (nb0 == ts && nb1 == ts*ne0/bs) {
  return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
- } else if (nb0 == ts) {
+ }
+ if (nb0 == ts) {
  return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
- } else {
- for (int64_t i1 = 0; i1 < i1_diff; i1++) {
- const void * rx = (const void *) ((const char *) x + i1*nb1);
- void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
- // pretend the row is a matrix with cols=1
- cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
- if (r != cudaSuccess) return r;
- }
- return cudaSuccess;
  }
+ for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+ const void * rx = (const void *) ((const char *) x + i1*nb1);
+ void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+ // pretend the row is a matrix with cols=1
+ cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+ if (r != cudaSuccess) { return r; }
+ }
+ return cudaSuccess;
  }

  static void ggml_cuda_op_repeat(
@@ -6989,7 +7016,7 @@ static void ggml_cuda_op_mul_mat(
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];
  const int64_t ne03 = src0->ne[3];
- const int64_t nrows0 = ggml_nrows(src0);
+ // const int64_t nrows0 = ggml_nrows(src0);

  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
@@ -7090,7 +7117,7 @@ static void ggml_cuda_op_mul_mat(
  if (src0_on_device && src0_is_contiguous) {
  src0_dd[id] = (char *) src0_extra->data_device[id];
  } else {
- const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+ // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
  src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
  }

@@ -7323,7 +7350,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
  }

  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
- if (!g_cublas_loaded) return false;
+ if (!g_cublas_loaded) { return false; }

  const int64_t ne10 = src1->ne[0];

@@ -7401,7 +7428,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
  }

- __global__ void k_compute_batched_ptrs(
+ __global__ static void k_compute_batched_ptrs(
  const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
  const void ** ptrs_src, void ** ptrs_dst,
  int ne12, int ne13,
@@ -8017,7 +8044,7 @@ void ggml_cuda_free_scratch() {
  }

  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
- if (!g_cublas_loaded) return false;
+ if (!g_cublas_loaded) { return false; }

  ggml_cuda_func_t func;
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8031,7 +8058,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  if (tensor->op == GGML_OP_MUL_MAT) {
  if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+ fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
  #endif
  return false;
  }
@@ -8316,14 +8343,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
  UNUSED(cgraph);
  }

- static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ [[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  GGML_ASSERT(!"not implemented");

  UNUSED(backend);
  UNUSED(plan);
  }

- static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ [[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  GGML_ASSERT(!"not implemented");

  UNUSED(backend);
@@ -8339,8 +8366,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
  for (int i = 0; i < cgraph->n_nodes; i++) {
  ggml_tensor * node = cgraph->nodes[i];

- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+ if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
  continue;
+ }
  assert(node->backend == GGML_BACKEND_GPU);
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  if (node->src[j] != nullptr) {
@@ -19,7 +19,7 @@
  #ifdef __wasm_simd128__
  #include <wasm_simd128.h>
  #else
- #ifdef __POWER9_VECTOR__
+ #if defined(__POWER9_VECTOR__) || defined(__powerpc64__)
  #include <altivec.h>
  #undef bool
  #define bool _Bool