llama_cpp 0.9.3 → 0.9.4

This diff covers the content of publicly available package versions as released to one of the supported registries, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 2491ee80a5e822375f140b7d465a6783be62ef9f98aa510495723bd2d80b3f81
- data.tar.gz: ad9ddbda1470602b976231edae030efd1ef0d854b41e0ce509e9b07ec78113e9
+ metadata.gz: 0fe656f26d7680d1b96c6949d40f4f615209c1c752b45ef145ac0f68b4af1d26
+ data.tar.gz: fb4d3c5b54a854edeeaf070b5497ba6656a5cff59b6b911b638551462004efb3
  SHA512:
- metadata.gz: 8a82ed440ae2bbe20f2c3818f22f88f1c5cab659060ad085a43ee657d1e60919acb74b9aac9b1d027fe84ddb30d170efc0e3799d33deddc59b4d34300332a798
- data.tar.gz: 164b4356580f0d2f17582fb84d59f0fbb9f816ac18921ea67d7cdda7f484620b605fdb88111ee32c1a42400c0770c520841304f7c2230ba577f4df1e5db453a0
+ metadata.gz: 6dc8bc34fcb2635e5fa99c31f134dca12af4c48a0c3f1effbbf209e6e3156f1f95bf133ed33c2eabc6e9f7988d668dcbdb0545a3807b38969680618ba8774848
+ data.tar.gz: 591d9ed44ed3b3a40424d3903659ad868afff727a2cfaffefd6222ba54f8a51fbfbab109ceea22a9a6bd3ca4661fb3947ca8f3f179ac2d0ad8cf8ba917b30ffe
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+ ## [[0.9.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.3...v0.9.4)] - 2023-11-25
+
+ - Bump bundled llama.cpp from b1523 to b1555.
+
  ## [[0.9.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.2...v0.9.3)] - 2023-11-18

  - Bump bundled llama.cpp from b1500 to b1523.
@@ -1,4 +1,5 @@
  #include <algorithm>
+ #include <cinttypes>
  #include <cstddef>
  #include <cstdint>
  #include <limits>
@@ -235,7 +236,7 @@ typedef float2 dfloat2;
  #endif //GGML_CUDA_F16

  static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
- const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+ const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment

  int x32 = 0;
  x32 |= x16[0] << 0;
@@ -245,7 +246,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const
  }

  static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
- const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+ const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment

  int x32 = 0;
  x32 |= x16[0] << 0;
@@ -255,11 +256,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con
  }

  static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
- return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+ return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  }

  static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
- return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+ return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  }

  template<typename T>
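For context: these hunks only add const to the intermediate casts, so that reading through a pointer-to-const parameter no longer casts the qualifier away (the kind of thing -Wcast-qual flags). A minimal host-side sketch of the same pattern, with illustrative names rather than the gem's code:

    #include <cstdint>

    // Read an int8_t buffer as 16-bit halves; the parameter is const,
    // so the temporary pointer must be const-qualified as well.
    static int get_int_from_bytes(const int8_t * x8, int i32) {
        const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // const preserved
        // uint16_t * bad = (uint16_t *) x8;  // would cast away const and trip -Wcast-qual
        int x32 = 0;
        x32 |= x16[0] << 0;
        x32 |= x16[1] << 16;
        return x32;
    }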
@@ -469,7 +470,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
  #define MUL_MAT_SRC1_COL_STRIDE 128

  #define MAX_STREAMS 8
- static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+ static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };

  struct ggml_tensor_extra_gpu {
  void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
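`{ nullptr }` braces only the first row of a two-dimensional array, which clang typically reports under -Wmissing-braces even though the remaining elements are zero-initialized either way; `{ { nullptr } }` spells the nesting out. A standalone sketch with hypothetical sizes and plain pointers standing in for cudaStream_t:

    #define MAX_DEVICES 16
    #define MAX_STREAMS 8

    // Outer braces for the array of rows, inner braces for the first row;
    // all remaining elements are still value-initialized to nullptr.
    static void * g_streams[MAX_DEVICES][MAX_STREAMS] = { { nullptr } };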
@@ -2248,6 +2249,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
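The repeated `(void)x_qh; (void)x_sc;` additions throughout the following hunks mark parameters that a given quantization format does not use, suppressing -Wunused-parameter without changing the signature shared by all tile loaders. A reduced sketch of the idiom (hypothetical function, not the gem's):

    // Shared signature across quantization formats; this format has no qh/sc
    // data, so those parameters are deliberately discarded.
    static void allocate_tiles_example(int ** x_ql, int ** x_qh, int ** x_sc) {
        (void)x_qh; (void)x_sc;   // unused here; keeps -Wunused-parameter quiet
        static int tile_qs[64];
        *x_ql = tile_qs;
    }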
@@ -2259,7 +2261,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-
+ (void)x_qh; (void)x_sc;
  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
  GGML_CUDA_ASSUME(k >= 0);
@@ -2268,7 +2270,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI4_0;
  const int kqsx = k % QI4_0;

- const block_q4_0 * bx0 = (block_q4_0 *) vx;
+ const block_q4_0 * bx0 = (const block_q4_0 *) vx;

  float * x_dmf = (float *) x_dm;

@@ -2306,9 +2308,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
- const float * x_dmf = (float *) x_dm;
+ const float * x_dmf = (const float *) x_dm;

  int u[2*VDR_Q4_0_Q8_1_MMQ];

@@ -2342,6 +2345,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
@@ -2353,6 +2357,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2362,7 +2367,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI4_1;
  const int kqsx = k % QI4_1;

- const block_q4_1 * bx0 = (block_q4_1 *) vx;
+ const block_q4_1 * bx0 = (const block_q4_1 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2397,6 +2402,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));

@@ -2434,6 +2440,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
@@ -2445,6 +2452,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2454,7 +2462,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI5_0;
  const int kqsx = k % QI5_0;

- const block_q5_0 * bx0 = (block_q5_0 *) vx;
+ const block_q5_0 * bx0 = (const block_q5_0 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2509,6 +2517,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
@@ -2548,6 +2557,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
@@ -2559,6 +2569,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2568,7 +2579,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI5_1;
  const int kqsx = k % QI5_1;

- const block_q5_1 * bx0 = (block_q5_1 *) vx;
+ const block_q5_1 * bx0 = (const block_q5_1 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2620,6 +2631,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
@@ -2654,6 +2666,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
@@ -2665,6 +2678,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2675,7 +2689,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kqsx = k % QI8_0;
  float * x_dmf = (float *) x_dm;

- const block_q8_0 * bx0 = (block_q8_0 *) vx;
+ const block_q8_0 * bx0 = (const block_q8_0 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2710,6 +2724,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const float * x_dmf = (const float *) x_dm;
  const float * y_df = (const float *) y_ds;
@@ -2743,6 +2758,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
@@ -2756,6 +2772,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2765,7 +2782,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI2_K;
  const int kqsx = k % QI2_K;

- const block_q2_K * bx0 = (block_q2_K *) vx;
+ const block_q2_K * bx0 = (const block_q2_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2813,6 +2830,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const int kbx = k / QI2_K;
  const int ky = (k % QI2_K) * QR2_K;
@@ -2886,7 +2904,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI3_K;
  const int kqsx = k % QI3_K;

- const block_q3_K * bx0 = (block_q3_K *) vx;
+ const block_q3_K * bx0 = (const block_q3_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2967,7 +2985,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
  const float * x_dmf = (const float *) x_dm;
  const float * y_df = (const float *) y_ds;

- const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+ const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;

  int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];

@@ -3082,6 +3100,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
@@ -3095,6 +3114,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3104,7 +3124,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI4_K; // == 0 if QK_K == 256
  const int kqsx = k % QI4_K; // == k if QK_K == 256

- const block_q4_K * bx0 = (block_q4_K *) vx;
+ const block_q4_K * bx0 = (const block_q4_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3149,7 +3169,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

  const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);

- const int * scales = (int *) bxi->scales;
+ const int * scales = (const int *) bxi->scales;

  const int ksc = k % (WARP_SIZE/8);

@@ -3164,6 +3184,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);

@@ -3263,6 +3284,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
@@ -3276,6 +3298,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3285,7 +3308,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI5_K; // == 0 if QK_K == 256
  const int kqsx = k % QI5_K; // == k if QK_K == 256

- const block_q5_K * bx0 = (block_q5_K *) vx;
+ const block_q5_K * bx0 = (const block_q5_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3341,7 +3364,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

  const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);

- const int * scales = (int *) bxi->scales;
+ const int * scales = (const int *) bxi->scales;

  const int ksc = k % (WARP_SIZE/8);

@@ -3356,6 +3379,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);

@@ -3392,6 +3416,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
@@ -3405,6 +3430,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3414,7 +3440,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI6_K; // == 0 if QK_K == 256
  const int kqsx = k % QI6_K; // == k if QK_K == 256

- const block_q6_K * bx0 = (block_q6_K *) vx;
+ const block_q6_K * bx0 = (const block_q6_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3476,6 +3502,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const float * x_dmf = (const float *) x_dm;
  const float * y_df = (const float *) y_ds;
@@ -3518,7 +3545,7 @@ static __device__ __forceinline__ void mul_mat_q(
  __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
  __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];

- float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
+ float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};

  for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {

@@ -6023,18 +6050,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
  const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
  if (nb0 == ts && nb1 == ts*ne0/bs) {
  return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
- } else if (nb0 == ts) {
+ }
+ if (nb0 == ts) {
  return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
- } else {
- for (int64_t i1 = 0; i1 < i1_diff; i1++) {
- const void * rx = (const void *) ((const char *) x + i1*nb1);
- void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
- // pretend the row is a matrix with cols=1
- cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
- if (r != cudaSuccess) return r;
- }
- return cudaSuccess;
  }
+ for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+ const void * rx = (const void *) ((const char *) x + i1*nb1);
+ void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+ // pretend the row is a matrix with cols=1
+ cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+ if (r != cudaSuccess) { return r; }
+ }
+ return cudaSuccess;
  }

  static void ggml_cuda_op_repeat(
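This hunk flattens an if/else-if/else chain into early returns (every branch already returned) and braces the single-statement `return r;`. The shape of that transformation on a toy function, not the CUDA copy helper itself:

    // Before: if (...) { return 0; } else if (...) { return 1; } else { return 2; }
    // After: each branch returns, so the else arms are redundant.
    static int classify(int nb0, int nb1, int ts) {
        if (nb0 == ts && nb1 == ts) {
            return 0;   // fully contiguous
        }
        if (nb0 == ts) {
            return 1;   // contiguous rows
        }
        return 2;       // general strided case
    }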
@@ -6989,7 +7016,7 @@ static void ggml_cuda_op_mul_mat(
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];
  const int64_t ne03 = src0->ne[3];
- const int64_t nrows0 = ggml_nrows(src0);
+ // const int64_t nrows0 = ggml_nrows(src0);

  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
@@ -7090,7 +7117,7 @@ static void ggml_cuda_op_mul_mat(
  if (src0_on_device && src0_is_contiguous) {
  src0_dd[id] = (char *) src0_extra->data_device[id];
  } else {
- const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+ // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
  src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
  }

@@ -7323,7 +7350,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
  }

  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
- if (!g_cublas_loaded) return false;
+ if (!g_cublas_loaded) { return false; }

  const int64_t ne10 = src1->ne[0];

@@ -7401,7 +7428,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
  }

- __global__ void k_compute_batched_ptrs(
+ __global__ static void k_compute_batched_ptrs(
  const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
  const void ** ptrs_src, void ** ptrs_dst,
  int ne12, int ne13,
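Adding `static` here gives the kernel internal linkage: it is only launched from this translation unit, so no symbol is exported and missing-prototype style diagnostics stop asking for a declaration. A minimal sketch with a hypothetical kernel, mirroring the qualifier order used in the hunk:

    // Internal-linkage kernel: only visible within this .cu file.
    __global__ static void k_scale(float * data, float factor, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            data[i] *= factor;
        }
    }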
@@ -8017,7 +8044,7 @@ void ggml_cuda_free_scratch() {
  }

  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
- if (!g_cublas_loaded) return false;
+ if (!g_cublas_loaded) { return false; }

  ggml_cuda_func_t func;
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8031,7 +8058,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  if (tensor->op == GGML_OP_MUL_MAT) {
  if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+ fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
  #endif
  return false;
  }
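This is where the `#include <cinttypes>` added at the top of the file gets used: `ggml_tensor::ne` holds int64_t values, so `%d` was the wrong conversion. The usual <cinttypes> idiom splices the PRId64 macro after a literal "%" in the format string; note that as printed in this hunk the "%" before each PRId64 is absent, so treat the following as the general pattern rather than a copy of the gem's line:

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>

    int main() {
        int64_t ne3 = 12;
        // PRId64 expands to the length/conversion suffix for int64_t,
        // so it must follow a literal "%" in the format string.
        printf("src0->ne[3] = %" PRId64 "\n", ne3);
        return 0;
    }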
@@ -8316,14 +8343,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
  UNUSED(cgraph);
  }

- static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ [[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  GGML_ASSERT(!"not implemented");

  UNUSED(backend);
  UNUSED(plan);
  }

- static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ [[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  GGML_ASSERT(!"not implemented");

  UNUSED(backend);
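`GGML_ASSERT(!"not implemented")` always fails and aborts, so these two stubs never return; the C++11 `[[noreturn]]` attribute records that fact for the compiler, presumably to keep noreturn-related warnings quiet. A standalone sketch with a hypothetical assert stand-in, not the gem's macro:

    #include <cstdio>
    #include <cstdlib>

    // Stand-in for GGML_ASSERT(false): report and abort, never returns.
    [[noreturn]] static void fail(const char * msg) {
        std::fprintf(stderr, "fatal: %s\n", msg);
        std::abort();
    }

    // A stub whose body always aborts can itself carry [[noreturn]],
    // matching what this hunk does for the two graph-plan stubs.
    [[noreturn]] static void graph_plan_stub(void * backend, void * plan) {
        (void)backend; (void)plan;
        fail("not implemented");
    }

    int main() {
        graph_plan_stub(nullptr, nullptr); // aborts by design
    }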
@@ -8339,8 +8366,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
  for (int i = 0; i < cgraph->n_nodes; i++) {
  ggml_tensor * node = cgraph->nodes[i];

- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+ if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
  continue;
+ }
  assert(node->backend == GGML_BACKEND_GPU);
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  if (node->src[j] != nullptr) {
@@ -19,7 +19,7 @@
  #ifdef __wasm_simd128__
  #include <wasm_simd128.h>
  #else
- #ifdef __POWER9_VECTOR__
+ #if defined(__POWER9_VECTOR__) || defined(__powerpc64__)
  #include <altivec.h>
  #undef bool
  #define bool _Bool
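The last hunk widens the Altivec guard so that any 64-bit PowerPC build pulls in <altivec.h>, not only compilers that define __POWER9_VECTOR__; __powerpc64__ is the generic GCC/Clang macro for ppc64 targets. A condensed view of the resulting preprocessor logic (the bool redefinitions and later branches omitted):

    #ifdef __wasm_simd128__
    #include <wasm_simd128.h>
    #else
    #if defined(__POWER9_VECTOR__) || defined(__powerpc64__)
    #include <altivec.h>   // any 64-bit PowerPC build now gets the Altivec/VSX intrinsics
    #endif
    #endif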