llama_cpp 0.9.3 → 0.9.5

@@ -1,4 +1,5 @@
  #include <algorithm>
+ #include <cinttypes>
  #include <cstddef>
  #include <cstdint>
  #include <limits>
@@ -235,7 +236,7 @@ typedef float2 dfloat2;
  #endif //GGML_CUDA_F16

  static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
- const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+ const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment

  int x32 = 0;
  x32 |= x16[0] << 0;
@@ -245,7 +246,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const
  }

  static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
- const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+ const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment

  int x32 = 0;
  x32 |= x16[0] << 0;
@@ -255,11 +256,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con
  }

  static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
- return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+ return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  }

  static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
- return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+ return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  }

  template<typename T>
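Note on the hunks above: the 0.9.5 code keeps the const qualifier of the source pointer in each cast, which is warning-clean (e.g. under -Wcast-qual) without changing behavior. A minimal sketch of the pattern, with a made-up helper name and not taken verbatim from the file:

    #include <cstdint>

    // The cast target keeps "const", so no qualifier is silently dropped
    // when reinterpreting the byte pointer as an int pointer.
    static inline int load_int_from_bytes(const std::int8_t * x8, int i32) {
        return *((const int *) (x8 + sizeof(int) * i32)); // assumes 4-byte alignment
    }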
@@ -442,6 +443,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_SCALE_BLOCK_SIZE 256
  #define CUDA_CLAMP_BLOCK_SIZE 256
  #define CUDA_ROPE_BLOCK_SIZE 256
+ #define CUDA_SOFT_MAX_BLOCK_SIZE 1024
  #define CUDA_ALIBI_BLOCK_SIZE 32
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
@@ -469,7 +471,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
  #define MUL_MAT_SRC1_COL_STRIDE 128

  #define MAX_STREAMS 8
- static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+ static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };

  struct ggml_tensor_extra_gpu {
  void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
@@ -500,6 +502,31 @@ static size_t g_scratch_offset = 0;

  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

+ static __device__ __forceinline__ float warp_reduce_sum(float x) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+ }
+ return x;
+ }
+
+ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+ a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+ }
+ return a;
+ }
+
+ static __device__ __forceinline__ float warp_reduce_max(float x) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+ }
+ return x;
+ }
+
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

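Note on the block added above: warp_reduce_sum and warp_reduce_max are butterfly reductions over __shfl_xor_sync; after the five XOR steps every lane of the 32-thread warp holds the combined value. A hedged usage sketch (hypothetical kernel, assuming the warp_reduce_sum above is in scope and the block is a single warp):

    // Each lane accumulates a strided partial sum over one row, the warp
    // combines the partials, and lane 0 writes the row total.
    static __global__ void row_sum_f32(const float * x, float * dst, const int ncols) {
        const int row = blockIdx.x;
        float sum = 0.0f;
        for (int col = threadIdx.x; col < ncols; col += 32) {
            sum += x[row*ncols + col];
        }
        sum = warp_reduce_sum(sum);   // all 32 lanes now hold the same total
        if (threadIdx.x == 0) {
            dst[row] = sum;
        }
    }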
@@ -576,15 +603,6 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
  dst[i] = x[i] * x[i];
  }

- static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
- a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
- }
- return a;
- }
-
  template <int block_size>
  static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -623,14 +641,6 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
  }
  }

- static __device__ __forceinline__ float warp_reduce_sum(float x) {
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- x += __shfl_xor_sync(0xffffffff, x, mask, 32);
- }
- return x;
- }
-
  template <int block_size>
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -2248,6 +2258,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
@@ -2259,7 +2270,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-
+ (void)x_qh; (void)x_sc;
  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
  GGML_CUDA_ASSUME(k >= 0);
@@ -2268,7 +2279,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI4_0;
  const int kqsx = k % QI4_0;

- const block_q4_0 * bx0 = (block_q4_0 *) vx;
+ const block_q4_0 * bx0 = (const block_q4_0 *) vx;

  float * x_dmf = (float *) x_dm;

@@ -2306,9 +2317,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
- const float * x_dmf = (float *) x_dm;
+ const float * x_dmf = (const float *) x_dm;

  int u[2*VDR_Q4_0_Q8_1_MMQ];

@@ -2342,6 +2354,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
@@ -2353,6 +2366,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2362,7 +2376,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI4_1;
  const int kqsx = k % QI4_1;

- const block_q4_1 * bx0 = (block_q4_1 *) vx;
+ const block_q4_1 * bx0 = (const block_q4_1 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2397,6 +2411,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));

@@ -2434,6 +2449,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
@@ -2445,6 +2461,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2454,7 +2471,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI5_0;
  const int kqsx = k % QI5_0;

- const block_q5_0 * bx0 = (block_q5_0 *) vx;
+ const block_q5_0 * bx0 = (const block_q5_0 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2509,6 +2526,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
@@ -2548,6 +2566,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
@@ -2559,6 +2578,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2568,7 +2588,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI5_1;
  const int kqsx = k % QI5_1;

- const block_q5_1 * bx0 = (block_q5_1 *) vx;
+ const block_q5_1 * bx0 = (const block_q5_1 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2620,6 +2640,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
@@ -2654,6 +2675,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
@@ -2665,6 +2687,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2675,7 +2698,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kqsx = k % QI8_0;
  float * x_dmf = (float *) x_dm;

- const block_q8_0 * bx0 = (block_q8_0 *) vx;
+ const block_q8_0 * bx0 = (const block_q8_0 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2710,6 +2733,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const float * x_dmf = (const float *) x_dm;
  const float * y_df = (const float *) y_ds;
@@ -2743,6 +2767,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
@@ -2756,6 +2781,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2765,7 +2791,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI2_K;
  const int kqsx = k % QI2_K;

- const block_q2_K * bx0 = (block_q2_K *) vx;
+ const block_q2_K * bx0 = (const block_q2_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2813,6 +2839,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const int kbx = k / QI2_K;
  const int ky = (k % QI2_K) * QR2_K;
@@ -2886,7 +2913,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI3_K;
  const int kqsx = k % QI3_K;

- const block_q3_K * bx0 = (block_q3_K *) vx;
+ const block_q3_K * bx0 = (const block_q3_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2967,7 +2994,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
  const float * x_dmf = (const float *) x_dm;
  const float * y_df = (const float *) y_ds;

- const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+ const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;

  int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];

@@ -3082,6 +3109,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
@@ -3095,6 +3123,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3104,7 +3133,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI4_K; // == 0 if QK_K == 256
  const int kqsx = k % QI4_K; // == k if QK_K == 256

- const block_q4_K * bx0 = (block_q4_K *) vx;
+ const block_q4_K * bx0 = (const block_q4_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3149,7 +3178,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

  const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);

- const int * scales = (int *) bxi->scales;
+ const int * scales = (const int *) bxi->scales;

  const int ksc = k % (WARP_SIZE/8);

@@ -3164,6 +3193,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);

@@ -3263,6 +3293,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
@@ -3276,6 +3307,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3285,7 +3317,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI5_K; // == 0 if QK_K == 256
  const int kqsx = k % QI5_K; // == k if QK_K == 256

- const block_q5_K * bx0 = (block_q5_K *) vx;
+ const block_q5_K * bx0 = (const block_q5_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3341,7 +3373,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

  const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);

- const int * scales = (int *) bxi->scales;
+ const int * scales = (const int *) bxi->scales;

  const int ksc = k % (WARP_SIZE/8);

@@ -3356,6 +3388,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);

@@ -3392,6 +3425,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
@@ -3405,6 +3439,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3414,7 +3449,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI6_K; // == 0 if QK_K == 256
  const int kqsx = k % QI6_K; // == k if QK_K == 256

- const block_q6_K * bx0 = (block_q6_K *) vx;
+ const block_q6_K * bx0 = (const block_q6_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3476,6 +3511,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const float * x_dmf = (const float *) x_dm;
  const float * y_df = (const float *) y_ds;
@@ -3518,7 +3554,7 @@ static __device__ __forceinline__ void mul_mat_q(
  __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
  __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];

- float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
+ float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};

  for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {

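Note on the initializer change above (and the earlier g_cudaStreams one): adding the inner braces spells out aggregate initialization of a two-dimensional array. Both forms zero every element; the nested form just avoids -Wmissing-braces. A tiny illustration with a hypothetical array:

    float a[2][4] = {0.0f};     // valid, but some compilers warn about missing braces
    float b[2][4] = {{0.0f}};   // explicit nesting, same zero-initialized contents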
@@ -4583,8 +4619,8 @@ static __global__ void rope(

  template<typename T, bool has_pos>
  static __global__ void rope_neox(
- const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
- float ext_factor, float attn_factor, rope_corr_dims corr_dims
+ const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
  ) {
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

@@ -4593,23 +4629,25 @@ static __global__ void rope_neox(
  }

  const int row = blockDim.x*blockIdx.x + threadIdx.x;
- const int i = row*ncols + col/2;
+ const int ib = col / n_dims;
+ const int ic = col % n_dims;
+
+ const int i = row*ncols + ib*n_dims + ic/2;
  const int i2 = row/p_delta_rows;

- // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
- const float cur_rot = -float(col)/ncols;
+ float cur_rot = inv_ndims * ic - ib;

  const int p = has_pos ? pos[i2] : 0;
- const float theta_base = p*powf(freq_base, cur_rot);
+ const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);

  float cos_theta, sin_theta;
  rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);

  const float x0 = x[i + 0];
- const float x1 = x[i + ncols/2];
+ const float x1 = x[i + n_dims/2];

- dst[i + 0] = x0*cos_theta - x1*sin_theta;
- dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
+ dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
  }

  static __global__ void rope_glm_f32(
@@ -4688,45 +4726,74 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
  dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
  }

- // the CUDA soft max implementation differs from the CPU implementation
- // instead of doubles floats are used
- static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
- const int row = blockDim.x*blockIdx.x + threadIdx.x;
- const int block_size = blockDim.y;
- const int tid = threadIdx.y;
+ static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
+ const int tid = threadIdx.x;
+ const int rowx = blockIdx.x;
+ const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+
+ const int block_size = blockDim.x;
+
+ const int warp_id = threadIdx.x / WARP_SIZE;
+ const int lane_id = threadIdx.x % WARP_SIZE;
+
+ __shared__ float buf[CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE];

  float max_val = -INFINITY;

  for (int col = tid; col < ncols; col += block_size) {
- const int i = row*ncols + col;
- max_val = max(max_val, x[i]);
+ const int ix = rowx*ncols + col;
+ const int iy = rowy*ncols + col;
+ max_val = max(max_val, x[ix]*scale + (y ? y[iy] : 0.0f));
  }

  // find the max value in the block
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+ max_val = warp_reduce_max(max_val);
+ if (block_size > WARP_SIZE) {
+ if (warp_id == 0) {
+ buf[lane_id] = -INFINITY;
+ }
+ __syncthreads();
+
+ if (lane_id == 0) {
+ buf[warp_id] = max_val;
+ }
+ __syncthreads();
+
+ max_val = buf[lane_id];
+ max_val = warp_reduce_max(max_val);
  }

  float tmp = 0.f;

  for (int col = tid; col < ncols; col += block_size) {
- const int i = row*ncols + col;
- const float val = expf(x[i] - max_val);
+ const int ix = rowx*ncols + col;
+ const int iy = rowy*ncols + col;
+ const float val = expf((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val);
  tmp += val;
- dst[i] = val;
+ dst[ix] = val;
  }

- // sum up partial sums
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ // find the sum of exps in the block
+ tmp = warp_reduce_sum(tmp);
+ if (block_size > WARP_SIZE) {
+ if (warp_id == 0) {
+ buf[lane_id] = 0.f;
+ }
+ __syncthreads();
+
+ if (lane_id == 0) {
+ buf[warp_id] = tmp;
+ }
+ __syncthreads();
+
+ tmp = buf[lane_id];
+ tmp = warp_reduce_sum(tmp);
  }

  const float inv_tmp = 1.f / tmp;

  for (int col = tid; col < ncols; col += block_size) {
- const int i = row*ncols + col;
+ const int i = rowx*ncols + col;
  dst[i] *= inv_tmp;
  }
  }
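Note on the rewritten soft_max_f32 above: each block now handles one row of x, applies the scale factor and the optional additive mask y (broadcast across rows via rowy = rowx % nrows_y), and reduces first within each warp and then across warps through the shared buf array. A hedged host-side reference of the per-row math, in plain C++ with invented names:

    #include <math.h>

    // What one row of the kernel computes: softmax(scale*x + mask), mask optional.
    static void soft_max_row_ref(const float * x, const float * mask, float * dst,
                                 int ncols, float scale) {
        float max_val = -INFINITY;
        for (int c = 0; c < ncols; ++c) {
            max_val = fmaxf(max_val, x[c]*scale + (mask ? mask[c] : 0.0f));
        }
        float sum = 0.0f;
        for (int c = 0; c < ncols; ++c) {
            dst[c] = expf(x[c]*scale + (mask ? mask[c] : 0.0f) - max_val);
            sum += dst[c];
        }
        for (int c = 0; c < ncols; ++c) {
            dst[c] /= sum;   // the row now sums to 1
        }
    }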
@@ -5712,20 +5779,26 @@ static void rope_cuda(

  template<typename T>
  static void rope_neox_cuda(
- const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+ const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
  float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
  ) {
  GGML_ASSERT(ncols % 2 == 0);
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
+
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
+ const float inv_ndims = -1.0f / n_dims;
+
  if (pos == nullptr) {
  rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
- x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+ theta_scale, inv_ndims
  );
  } else {
  rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
- x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+ theta_scale, inv_ndims
  );
  }
  }
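Note on the wrapper above: it now forwards n_dims and precomputes theta_scale = powf(freq_base, -2.0f/n_dims) and inv_ndims = -1.0f/n_dims, so the kernel no longer needs freq_base itself; the angle it feeds into rope_yarn is p * freq_scale * theta_scale^(col/2). A hedged scalar illustration of that expression (standalone helper, name invented):

    #include <math.h>

    // Mirrors the theta_base computation in rope_neox above.
    static float neox_theta_base(int p, int col, int n_dims,
                                 float freq_base, float freq_scale) {
        const float theta_scale = powf(freq_base, -2.0f/n_dims);
        return p * freq_scale * powf(theta_scale, col/2.0f);
    }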
@@ -5757,10 +5830,12 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
  diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
  }

- static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
- const dim3 block_dims(1, WARP_SIZE, 1);
+ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
+ int nth = WARP_SIZE;
+ while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+ const dim3 block_dims(nth, 1, 1);
  const dim3 block_nums(nrows_x, 1, 1);
- soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
+ soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
  }

  static void im2col_f32_f16_cuda(const float * x, half * dst,
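Note on soft_max_f32_cuda above: it launches one block per row and chooses the thread count as the smallest power of two that covers the row, starting at one warp and capped at CUDA_SOFT_MAX_BLOCK_SIZE (1024). A hedged standalone illustration of that selection, with the constants written out:

    // e.g. ncols_x = 80 -> 128 threads; ncols_x = 4096 -> 1024 threads (the cap).
    static int soft_max_block_size(int ncols_x) {
        int nth = 32;                         // WARP_SIZE
        while (nth < ncols_x && nth < 1024) { // CUDA_SOFT_MAX_BLOCK_SIZE
            nth *= 2;
        }
        return nth;
    }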
@@ -6023,18 +6098,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
  const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
  if (nb0 == ts && nb1 == ts*ne0/bs) {
  return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
- } else if (nb0 == ts) {
+ }
+ if (nb0 == ts) {
  return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
- } else {
- for (int64_t i1 = 0; i1 < i1_diff; i1++) {
- const void * rx = (const void *) ((const char *) x + i1*nb1);
- void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
- // pretend the row is a matrix with cols=1
- cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
- if (r != cudaSuccess) return r;
- }
- return cudaSuccess;
  }
+ for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+ const void * rx = (const void *) ((const char *) x + i1*nb1);
+ void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+ // pretend the row is a matrix with cols=1
+ cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+ if (r != cudaSuccess) { return r; }
+ }
+ return cudaSuccess;
  }

  static void ggml_cuda_op_repeat(
@@ -6680,15 +6755,14 @@ inline void ggml_cuda_op_rope(
  GGML_ASSERT(false);
  rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
  } else if (is_neox) {
- GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
  if (src0->type == GGML_TYPE_F32) {
  rope_neox_cuda(
- (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
  attn_factor, corr_dims, main_stream
  );
  } else if (src0->type == GGML_TYPE_F16) {
  rope_neox_cuda(
- (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
  attn_factor, corr_dims, main_stream
  );
  } else {
@@ -6812,14 +6886,18 @@ inline void ggml_cuda_op_soft_max(
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

+ GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+
  const int64_t ne00 = src0->ne[0];
- const int64_t nrows = ggml_nrows(src0);
+ const int64_t nrows_x = ggml_nrows(src0);
+ const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;

- soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
+ float scale = 1.0f;
+ memcpy(&scale, dst->op_params, sizeof(float));
+
+ soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);

- (void) src1;
  (void) dst;
- (void) src1_dd;
  }

  inline void ggml_cuda_op_scale(
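Note on ggml_cuda_op_soft_max above: the scale factor is recovered from the destination tensor's op_params with memcpy rather than a pointer cast, which keeps the float-in-int32 storage well defined. A hedged sketch of that convention with invented helper names:

    #include <stdint.h>
    #include <string.h>

    // Store and recover a float in an int32 op_params slot without casting pointers.
    static void op_params_set_f32(int32_t * op_params, int idx, float v) {
        memcpy(&op_params[idx], &v, sizeof(float));
    }
    static float op_params_get_f32(const int32_t * op_params, int idx) {
        float v;
        memcpy(&v, &op_params[idx], sizeof(float));
        return v;
    }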
@@ -6989,7 +7067,7 @@ static void ggml_cuda_op_mul_mat(
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];
  const int64_t ne03 = src0->ne[3];
- const int64_t nrows0 = ggml_nrows(src0);
+ // const int64_t nrows0 = ggml_nrows(src0);

  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
@@ -7090,7 +7168,7 @@ static void ggml_cuda_op_mul_mat(
  if (src0_on_device && src0_is_contiguous) {
  src0_dd[id] = (char *) src0_extra->data_device[id];
  } else {
- const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+ // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
  src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
  }

@@ -7323,7 +7401,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
  }

  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
- if (!g_cublas_loaded) return false;
+ if (!g_cublas_loaded) { return false; }

  const int64_t ne10 = src1->ne[0];

@@ -7401,7 +7479,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
  }

- __global__ void k_compute_batched_ptrs(
+ __global__ static void k_compute_batched_ptrs(
  const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
  const void ** ptrs_src, void ** ptrs_dst,
  int ne12, int ne13,
@@ -8017,7 +8095,7 @@ void ggml_cuda_free_scratch() {
  }

  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
- if (!g_cublas_loaded) return false;
+ if (!g_cublas_loaded) { return false; }

  ggml_cuda_func_t func;
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8031,7 +8109,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  if (tensor->op == GGML_OP_MUL_MAT) {
  if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+ fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
  #endif
  return false;
  }
@@ -8316,14 +8394,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
  UNUSED(cgraph);
  }

- static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ [[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  GGML_ASSERT(!"not implemented");

  UNUSED(backend);
  UNUSED(plan);
  }

- static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ [[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  GGML_ASSERT(!"not implemented");

  UNUSED(backend);
@@ -8339,8 +8417,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
  for (int i = 0; i < cgraph->n_nodes; i++) {
  ggml_tensor * node = cgraph->nodes[i];

- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+ if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
  continue;
+ }
  assert(node->backend == GGML_BACKEND_GPU);
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  if (node->src[j] != nullptr) {