llama_cpp 0.9.3 → 0.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
  #include <algorithm>
+ #include <cinttypes>
  #include <cstddef>
  #include <cstdint>
  #include <limits>
@@ -235,7 +236,7 @@ typedef float2 dfloat2;
  #endif //GGML_CUDA_F16
 
  static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
- const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+ const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
  int x32 = 0;
  x32 |= x16[0] << 0;
@@ -245,7 +246,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const
  }
 
  static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
- const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+ const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
  int x32 = 0;
  x32 |= x16[0] << 0;
@@ -255,11 +256,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con
  }
 
  static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
- return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+ return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  }
 
  static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
- return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+ return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  }
 
  template<typename T>
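Note: the casts in the hunks above now carry the const qualifier of the source pointer, which keeps the reinterpretation const-correct and avoids cast-qualifier warnings. A standalone host-side sketch of the same pattern (illustrative only, not part of the package):

    #include <cstdint>

    // Reinterpreting a read-only byte buffer: keeping const on the target
    // pointer type preserves the const-ness of the source data.
    static int int_from_bytes(const int8_t * x8, int i32) {
        const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // 2-byte alignment assumed
        int x32 = 0;
        x32 |= x16[0] <<  0;
        x32 |= x16[1] << 16;
        return x32;
    }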
@@ -442,6 +443,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_SCALE_BLOCK_SIZE 256
  #define CUDA_CLAMP_BLOCK_SIZE 256
  #define CUDA_ROPE_BLOCK_SIZE 256
+ #define CUDA_SOFT_MAX_BLOCK_SIZE 1024
  #define CUDA_ALIBI_BLOCK_SIZE 32
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
@@ -469,7 +471,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
  #define MUL_MAT_SRC1_COL_STRIDE 128
 
  #define MAX_STREAMS 8
- static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+ static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };
 
  struct ggml_tensor_extra_gpu {
  void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
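Note: the extra braces give the two-dimensional array a fully braced initializer, which silences missing-braces warnings. A minimal standalone sketch of the pattern (illustrative names):

    // For a 2D aggregate, { nullptr } only names the first sub-array and some
    // compilers warn about the missing inner braces; { { nullptr } } explicitly
    // initializes the first row and zero-initializes everything else.
    static void * g_table[4][8] = { { nullptr } };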
@@ -500,6 +502,31 @@ static size_t g_scratch_offset = 0;
 
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
+ static __device__ __forceinline__ float warp_reduce_sum(float x) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+ }
+ return x;
+ }
+
+ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+ a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+ }
+ return a;
+ }
+
+ static __device__ __forceinline__ float warp_reduce_max(float x) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+ }
+ return x;
+ }
+
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -576,15 +603,6 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
  dst[i] = x[i] * x[i];
  }
 
- static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
- a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
- }
- return a;
- }
-
  template <int block_size>
  static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -623,14 +641,6 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
  }
  }
 
- static __device__ __forceinline__ float warp_reduce_sum(float x) {
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- x += __shfl_xor_sync(0xffffffff, x, mask, 32);
- }
- return x;
- }
-
  template <int block_size>
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
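Note: the hunks above hoist the warp_reduce_* helpers to the top of the file (adding warp_reduce_max) and drop the old copies that sat next to norm_f32 and rms_norm_f32. The helpers are xor-shuffle butterfly reductions over a 32-lane warp; a minimal standalone sketch of how such a helper is typically used (illustrative kernel, not part of the package):

    // After five xor-shuffle steps every lane of the warp holds the full sum.
    static __device__ __forceinline__ float warp_sum_demo(float x) {
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, mask, 32);
        }
        return x;
    }

    // Illustrative kernel: one 32-thread warp reduces 32 floats into out[0].
    static __global__ void sum32_demo(const float * in, float * out) {
        const float s = warp_sum_demo(in[threadIdx.x]);
        if (threadIdx.x == 0) {
            *out = s;
        }
    }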
@@ -2248,6 +2258,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
  }
 
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;
 
  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
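Note: this and the following hunks add (void) casts for tile pointers that a given quantization format does not use; the shared allocate_tiles/load_tiles/vec_dot interface keeps the parameters, and the casts silence unused-parameter warnings without changing any signatures. A minimal sketch of the idiom (illustrative only):

    // The common interface requires both arguments; this variant only needs the
    // first, so the second is discarded to mark it as intentionally unused.
    static int use_first_only(int a, int b) {
        (void) b;
        return 2 * a;
    }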
@@ -2259,7 +2270,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-
+ (void)x_qh; (void)x_sc;
  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
  GGML_CUDA_ASSUME(k >= 0);
@@ -2268,7 +2279,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI4_0;
  const int kqsx = k % QI4_0;
 
- const block_q4_0 * bx0 = (block_q4_0 *) vx;
+ const block_q4_0 * bx0 = (const block_q4_0 *) vx;
 
  float * x_dmf = (float *) x_dm;
 
@@ -2306,9 +2317,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;
 
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
- const float * x_dmf = (float *) x_dm;
+ const float * x_dmf = (const float *) x_dm;
 
  int u[2*VDR_Q4_0_Q8_1_MMQ];
 
@@ -2342,6 +2354,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
  }
 
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;
 
  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
@@ -2353,6 +2366,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;
 
  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2362,7 +2376,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI4_1;
  const int kqsx = k % QI4_1;
 
- const block_q4_1 * bx0 = (block_q4_1 *) vx;
+ const block_q4_1 * bx0 = (const block_q4_1 *) vx;
 
  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2397,6 +2411,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;
 
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
 
@@ -2434,6 +2449,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
  }
 
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;
 
  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
@@ -2445,6 +2461,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;
 
  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2454,7 +2471,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI5_0;
  const int kqsx = k % QI5_0;
 
- const block_q5_0 * bx0 = (block_q5_0 *) vx;
+ const block_q5_0 * bx0 = (const block_q5_0 *) vx;
 
  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2509,6 +2526,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;
 
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
@@ -2548,6 +2566,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
  }
 
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;
 
  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
@@ -2559,6 +2578,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;
 
  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2568,7 +2588,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI5_1;
  const int kqsx = k % QI5_1;
 
- const block_q5_1 * bx0 = (block_q5_1 *) vx;
+ const block_q5_1 * bx0 = (const block_q5_1 *) vx;
 
  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2620,6 +2640,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;
 
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
@@ -2654,6 +2675,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
  }
 
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;
 
  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
@@ -2665,6 +2687,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;
 
  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2675,7 +2698,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kqsx = k % QI8_0;
  float * x_dmf = (float *) x_dm;
 
- const block_q8_0 * bx0 = (block_q8_0 *) vx;
+ const block_q8_0 * bx0 = (const block_q8_0 *) vx;
 
  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2710,6 +2733,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;
 
  const float * x_dmf = (const float *) x_dm;
  const float * y_df = (const float *) y_ds;
@@ -2743,6 +2767,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
  }
 
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;
 
  __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
@@ -2756,6 +2781,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;
 
  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2765,7 +2791,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI2_K;
  const int kqsx = k % QI2_K;
 
- const block_q2_K * bx0 = (block_q2_K *) vx;
+ const block_q2_K * bx0 = (const block_q2_K *) vx;
 
  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2813,6 +2839,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;
 
  const int kbx = k / QI2_K;
  const int ky = (k % QI2_K) * QR2_K;
@@ -2886,7 +2913,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI3_K;
  const int kqsx = k % QI3_K;
 
- const block_q3_K * bx0 = (block_q3_K *) vx;
+ const block_q3_K * bx0 = (const block_q3_K *) vx;
 
  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2967,7 +2994,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
  const float * x_dmf = (const float *) x_dm;
  const float * y_df = (const float *) y_ds;
 
- const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+ const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
 
  int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
 
@@ -3082,6 +3109,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  }
 
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;
 
  __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
@@ -3095,6 +3123,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;
 
  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3104,7 +3133,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI4_K; // == 0 if QK_K == 256
  const int kqsx = k % QI4_K; // == k if QK_K == 256
 
- const block_q4_K * bx0 = (block_q4_K *) vx;
+ const block_q4_K * bx0 = (const block_q4_K *) vx;
 
  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3149,7 +3178,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
  const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
 
- const int * scales = (int *) bxi->scales;
+ const int * scales = (const int *) bxi->scales;
 
  const int ksc = k % (WARP_SIZE/8);
 
@@ -3164,6 +3193,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;
 
  const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
 
@@ -3263,6 +3293,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  }
 
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;
 
  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
@@ -3276,6 +3307,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;
 
  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3285,7 +3317,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI5_K; // == 0 if QK_K == 256
  const int kqsx = k % QI5_K; // == k if QK_K == 256
 
- const block_q5_K * bx0 = (block_q5_K *) vx;
+ const block_q5_K * bx0 = (const block_q5_K *) vx;
 
  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3341,7 +3373,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
  const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
 
- const int * scales = (int *) bxi->scales;
+ const int * scales = (const int *) bxi->scales;
 
  const int ksc = k % (WARP_SIZE/8);
 
@@ -3356,6 +3388,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;
 
  const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
 
@@ -3392,6 +3425,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
  }
 
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;
 
  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
@@ -3405,6 +3439,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;
 
  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3414,7 +3449,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI6_K; // == 0 if QK_K == 256
  const int kqsx = k % QI6_K; // == k if QK_K == 256
 
- const block_q6_K * bx0 = (block_q6_K *) vx;
+ const block_q6_K * bx0 = (const block_q6_K *) vx;
 
  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3476,6 +3511,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;
 
  const float * x_dmf = (const float *) x_dm;
  const float * y_df = (const float *) y_ds;
@@ -3518,7 +3554,7 @@ static __device__ __forceinline__ void mul_mat_q(
  __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
  __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
 
- float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
+ float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
 
  for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
 
@@ -4583,8 +4619,8 @@ static __global__ void rope(
 
  template<typename T, bool has_pos>
  static __global__ void rope_neox(
- const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
- float ext_factor, float attn_factor, rope_corr_dims corr_dims
+ const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
  ) {
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
@@ -4593,23 +4629,25 @@ static __global__ void rope_neox(
  }
 
  const int row = blockDim.x*blockIdx.x + threadIdx.x;
- const int i = row*ncols + col/2;
+ const int ib = col / n_dims;
+ const int ic = col % n_dims;
+
+ const int i = row*ncols + ib*n_dims + ic/2;
  const int i2 = row/p_delta_rows;
 
- // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
- const float cur_rot = -float(col)/ncols;
+ float cur_rot = inv_ndims * ic - ib;
 
  const int p = has_pos ? pos[i2] : 0;
- const float theta_base = p*powf(freq_base, cur_rot);
+ const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
 
  float cos_theta, sin_theta;
  rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
  const float x0 = x[i + 0];
- const float x1 = x[i + ncols/2];
+ const float x1 = x[i + n_dims/2];
 
- dst[i + 0] = x0*cos_theta - x1*sin_theta;
- dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
+ dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
  }
 
  static __global__ void rope_glm_f32(
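Note: rope_neox now takes n_dims separately from ncols, so it can rotate only a leading slice of each row; the host passes theta_scale = freq_base^(-2/n_dims) and inv_ndims = -1/n_dims (see the rope_neox_cuda hunk further down). A standalone host-side sketch of the angle math, under the assumption that it mirrors the kernel above (illustrative only, not part of the package):

    #include <cmath>

    // theta_base = p * freq_scale * theta_scale^(col/2)
    //            = p * freq_scale * freq_base^(-col/n_dims)
    static float neox_theta_base(int p, int col, int n_dims, float freq_base, float freq_scale) {
        const float theta_scale = powf(freq_base, -2.0f / n_dims);
        return p * freq_scale * powf(theta_scale, col / 2.0f);
    }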
@@ -4688,45 +4726,74 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
  dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
  }
 
- // the CUDA soft max implementation differs from the CPU implementation
- // instead of doubles floats are used
- static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
- const int row = blockDim.x*blockIdx.x + threadIdx.x;
- const int block_size = blockDim.y;
- const int tid = threadIdx.y;
+ static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
+ const int tid = threadIdx.x;
+ const int rowx = blockIdx.x;
+ const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+
+ const int block_size = blockDim.x;
+
+ const int warp_id = threadIdx.x / WARP_SIZE;
+ const int lane_id = threadIdx.x % WARP_SIZE;
+
+ __shared__ float buf[CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE];
 
  float max_val = -INFINITY;
 
  for (int col = tid; col < ncols; col += block_size) {
- const int i = row*ncols + col;
- max_val = max(max_val, x[i]);
+ const int ix = rowx*ncols + col;
+ const int iy = rowy*ncols + col;
+ max_val = max(max_val, x[ix]*scale + (y ? y[iy] : 0.0f));
  }
 
  // find the max value in the block
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+ max_val = warp_reduce_max(max_val);
+ if (block_size > WARP_SIZE) {
+ if (warp_id == 0) {
+ buf[lane_id] = -INFINITY;
+ }
+ __syncthreads();
+
+ if (lane_id == 0) {
+ buf[warp_id] = max_val;
+ }
+ __syncthreads();
+
+ max_val = buf[lane_id];
+ max_val = warp_reduce_max(max_val);
  }
 
  float tmp = 0.f;
 
  for (int col = tid; col < ncols; col += block_size) {
- const int i = row*ncols + col;
- const float val = expf(x[i] - max_val);
+ const int ix = rowx*ncols + col;
+ const int iy = rowy*ncols + col;
+ const float val = expf((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val);
  tmp += val;
- dst[i] = val;
+ dst[ix] = val;
  }
 
- // sum up partial sums
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ // find the sum of exps in the block
+ tmp = warp_reduce_sum(tmp);
+ if (block_size > WARP_SIZE) {
+ if (warp_id == 0) {
+ buf[lane_id] = 0.f;
+ }
+ __syncthreads();
+
+ if (lane_id == 0) {
+ buf[warp_id] = tmp;
+ }
+ __syncthreads();
+
+ tmp = buf[lane_id];
+ tmp = warp_reduce_sum(tmp);
  }
 
  const float inv_tmp = 1.f / tmp;
 
  for (int col = tid; col < ncols; col += block_size) {
- const int i = row*ncols + col;
+ const int i = rowx*ncols + col;
  dst[i] *= inv_tmp;
  }
  }
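Note: the rewritten kernel fuses the scale factor and an optional additive mask (y) into the softmax, and reduces across the block with warp_reduce_max/warp_reduce_sum plus a small shared-memory buffer when the block is wider than one warp. For reference, a plain CPU sketch of the same math (illustrative only, not part of the package):

    #include <cmath>

    // dst[rowx] = softmax(x[rowx]*scale + mask[rowx % nrows_y]), with the usual
    // max subtraction for numerical stability; mask may be null.
    static void soft_max_ref(const float * x, const float * mask, float * dst,
                             int ncols, int nrows_x, int nrows_y, float scale) {
        for (int rowx = 0; rowx < nrows_x; ++rowx) {
            const int rowy = rowx % nrows_y; // broadcast the mask across rows
            float max_val = -INFINITY;
            for (int col = 0; col < ncols; ++col) {
                const float v = x[rowx*ncols + col]*scale + (mask ? mask[rowy*ncols + col] : 0.0f);
                max_val = fmaxf(max_val, v);
            }
            float sum = 0.0f;
            for (int col = 0; col < ncols; ++col) {
                const float v = x[rowx*ncols + col]*scale + (mask ? mask[rowy*ncols + col] : 0.0f);
                dst[rowx*ncols + col] = expf(v - max_val);
                sum += dst[rowx*ncols + col];
            }
            for (int col = 0; col < ncols; ++col) {
                dst[rowx*ncols + col] /= sum;
            }
        }
    }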
@@ -5712,20 +5779,26 @@ static void rope_cuda(
 
  template<typename T>
  static void rope_neox_cuda(
- const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+ const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
  float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
  ) {
  GGML_ASSERT(ncols % 2 == 0);
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
+
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
+ const float inv_ndims = -1.0f / n_dims;
+
  if (pos == nullptr) {
  rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
- x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+ theta_scale, inv_ndims
  );
  } else {
  rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
- x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+ theta_scale, inv_ndims
  );
  }
  }
@@ -5757,10 +5830,12 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
  diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
  }
 
- static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
- const dim3 block_dims(1, WARP_SIZE, 1);
+ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
+ int nth = WARP_SIZE;
+ while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+ const dim3 block_dims(nth, 1, 1);
  const dim3 block_nums(nrows_x, 1, 1);
- soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
+ soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
  }
 
  static void im2col_f32_f16_cuda(const float * x, half * dst,
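Note: the soft_max launcher now widens the block to the smallest power of two that covers ncols_x, capped at CUDA_SOFT_MAX_BLOCK_SIZE (1024), instead of always launching one warp per row. A tiny standalone sketch of that selection (constants inlined for illustration):

    // Smallest power of two >= ncols, clamped to [32, 1024]:
    // ncols = 80 -> 128 threads, ncols = 5000 -> 1024 threads.
    static int soft_max_block_size(int ncols) {
        int nth = 32;                        // WARP_SIZE
        while (nth < ncols && nth < 1024) {  // CUDA_SOFT_MAX_BLOCK_SIZE
            nth *= 2;
        }
        return nth;
    }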
@@ -6023,18 +6098,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
  const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
  if (nb0 == ts && nb1 == ts*ne0/bs) {
  return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
- } else if (nb0 == ts) {
+ }
+ if (nb0 == ts) {
  return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
- } else {
- for (int64_t i1 = 0; i1 < i1_diff; i1++) {
- const void * rx = (const void *) ((const char *) x + i1*nb1);
- void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
- // pretend the row is a matrix with cols=1
- cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
- if (r != cudaSuccess) return r;
- }
- return cudaSuccess;
  }
+ for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+ const void * rx = (const void *) ((const char *) x + i1*nb1);
+ void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+ // pretend the row is a matrix with cols=1
+ cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+ if (r != cudaSuccess) { return r; }
+ }
+ return cudaSuccess;
  }
 
  static void ggml_cuda_op_repeat(
@@ -6680,15 +6755,14 @@ inline void ggml_cuda_op_rope(
  GGML_ASSERT(false);
  rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
  } else if (is_neox) {
- GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
  if (src0->type == GGML_TYPE_F32) {
  rope_neox_cuda(
- (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
  attn_factor, corr_dims, main_stream
  );
  } else if (src0->type == GGML_TYPE_F16) {
  rope_neox_cuda(
- (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
  attn_factor, corr_dims, main_stream
  );
  } else {
@@ -6812,14 +6886,18 @@ inline void ggml_cuda_op_soft_max(
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
+ GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+
  const int64_t ne00 = src0->ne[0];
- const int64_t nrows = ggml_nrows(src0);
+ const int64_t nrows_x = ggml_nrows(src0);
+ const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
 
- soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
+ float scale = 1.0f;
+ memcpy(&scale, dst->op_params, sizeof(float));
+
+ soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
 
- (void) src1;
  (void) dst;
- (void) src1_dd;
  }
 
  inline void ggml_cuda_op_scale(
@@ -6989,7 +7067,7 @@ static void ggml_cuda_op_mul_mat(
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];
  const int64_t ne03 = src0->ne[3];
- const int64_t nrows0 = ggml_nrows(src0);
+ // const int64_t nrows0 = ggml_nrows(src0);
 
  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
@@ -7090,7 +7168,7 @@ static void ggml_cuda_op_mul_mat(
  if (src0_on_device && src0_is_contiguous) {
  src0_dd[id] = (char *) src0_extra->data_device[id];
  } else {
- const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+ // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
  src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
  }
 
@@ -7323,7 +7401,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
  }
 
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
- if (!g_cublas_loaded) return false;
+ if (!g_cublas_loaded) { return false; }
 
  const int64_t ne10 = src1->ne[0];
 
@@ -7401,7 +7479,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
  }
 
- __global__ void k_compute_batched_ptrs(
+ __global__ static void k_compute_batched_ptrs(
  const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
  const void ** ptrs_src, void ** ptrs_dst,
  int ne12, int ne13,
@@ -8017,7 +8095,7 @@ void ggml_cuda_free_scratch() {
  }
 
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
- if (!g_cublas_loaded) return false;
+ if (!g_cublas_loaded) { return false; }
 
  ggml_cuda_func_t func;
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8031,7 +8109,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  if (tensor->op == GGML_OP_MUL_MAT) {
  if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+ fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
  #endif
  return false;
  }
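Note: printing int64_t values portably uses the <cinttypes> format macros, which is why the include was added at the top of the file. A minimal standalone usage sketch (illustrative only):

    #include <cinttypes>
    #include <cstdio>

    int main() {
        const int64_t ne = 1234567890123LL;
        // "%" PRId64 expands to the right conversion specifier for int64_t
        // ("%ld" or "%lld" depending on the platform).
        printf("ne = %" PRId64 "\n", ne);
        return 0;
    }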
@@ -8316,14 +8394,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
  UNUSED(cgraph);
  }
 
- static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ [[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  GGML_ASSERT(!"not implemented");
 
  UNUSED(backend);
  UNUSED(plan);
  }
 
- static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ [[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  GGML_ASSERT(!"not implemented");
 
  UNUSED(backend);
@@ -8339,8 +8417,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
  for (int i = 0; i < cgraph->n_nodes; i++) {
  ggml_tensor * node = cgraph->nodes[i];
 
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+ if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
  continue;
+ }
  assert(node->backend == GGML_BACKEND_GPU);
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  if (node->src[j] != nullptr) {