llama_cpp 0.9.2 → 0.9.4

@@ -1,4 +1,5 @@
  #include <algorithm>
+ #include <cinttypes>
  #include <cstddef>
  #include <cstdint>
  #include <limits>
@@ -81,12 +82,15 @@

  #include "ggml-cuda.h"
  #include "ggml.h"
+ #include "ggml-backend-impl.h"

  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
  #define CC_VOLTA 700
  #define CC_OFFSET_AMD 1000000
  #define CC_RDNA2 (CC_OFFSET_AMD + 1030)

+ #define GGML_CUDA_MAX_NODES 8192
+
  // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
  // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
  // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
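The hunk above adds GGML_CUDA_MAX_NODES next to the existing compute-capability constants. As a point of reference only (not part of the diff, helper name hypothetical), a minimal standalone sketch of how a threshold like MIN_CC_DP4A is typically compared against a device's compute capability, encoded as major*100 + minor*10 so that 6.1 becomes 610:

```cuda
#include <cuda_runtime.h>

// Hypothetical helper, for illustration only: report whether a device can use
// __dp4a by comparing its compute capability against the 610 threshold that
// MIN_CC_DP4A encodes in the hunk above.
static bool device_supports_dp4a(int device_id) {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, device_id) != cudaSuccess) {
        return false;
    }
    const int cc = 100 * prop.major + 10 * prop.minor; // e.g. 6.1 -> 610
    return cc >= 610; // MIN_CC_DP4A
}
```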
@@ -232,7 +236,7 @@ typedef float2 dfloat2;
  #endif //GGML_CUDA_F16

  static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
- const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+ const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment

  int x32 = 0;
  x32 |= x16[0] << 0;
@@ -242,7 +246,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const
  }

  static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
- const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+ const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment

  int x32 = 0;
  x32 |= x16[0] << 0;
@@ -252,11 +256,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con
  }

  static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
- return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+ return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  }

  static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
- return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+ return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  }

  template<typename T>
@@ -433,6 +437,8 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_MUL_BLOCK_SIZE 256
  #define CUDA_GELU_BLOCK_SIZE 256
  #define CUDA_SILU_BLOCK_SIZE 256
+ #define CUDA_RELU_BLOCK_SIZE 256
+ #define CUDA_SQR_BLOCK_SIZE 256
  #define CUDA_CPY_BLOCK_SIZE 32
  #define CUDA_SCALE_BLOCK_SIZE 256
  #define CUDA_CLAMP_BLOCK_SIZE 256
@@ -464,7 +470,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
  #define MUL_MAT_SRC1_COL_STRIDE 128

  #define MAX_STREAMS 8
- static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+ static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };

  struct ggml_tensor_extra_gpu {
  void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
@@ -553,6 +559,24 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
  dst[i] = x[i] / (1.0f + expf(-x[i]));
  }

+ static __global__ void relu_f32(const float * x, float * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = fmaxf(x[i], 0);
+ }
+
+ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = x[i] * x[i];
+ }
+
  static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
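The relu_f32 and sqr_f32 kernels added above follow the same pattern as the existing silu_f32: a 1-D grid, one thread per element, and an early return past k. For illustration only, a self-contained harness sketch that builds on its own (kernel and block size are restated locally under different names; it mirrors the relu_f32_cuda launcher that appears later in this diff):

```cuda
#include <cstdio>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256 // mirrors CUDA_RELU_BLOCK_SIZE in the diff

// Local copy of the elementwise ReLU pattern shown in the hunk above.
static __global__ void relu_f32_demo(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = fmaxf(x[i], 0);
}

int main() {
    const int k = 1000;
    float h_x[k], h_y[k];
    for (int i = 0; i < k; i++) { h_x[i] = (i % 2 == 0) ? -1.0f*i : 1.0f*i; }

    float *d_x, *d_y;
    cudaMalloc(&d_x, k*sizeof(float));
    cudaMalloc(&d_y, k*sizeof(float));
    cudaMemcpy(d_x, h_x, k*sizeof(float), cudaMemcpyHostToDevice);

    // same rounding-up grid computation as the relu_f32_cuda launcher in this diff
    const int num_blocks = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    relu_f32_demo<<<num_blocks, BLOCK_SIZE>>>(d_x, d_y, k);

    cudaMemcpy(h_y, d_y, k*sizeof(float), cudaMemcpyDeviceToHost);
    printf("y[2] = %f, y[3] = %f\n", h_y[2], h_y[3]); // expect 0.0 and 3.0
    cudaFree(d_x); cudaFree(d_y);
    return 0;
}
```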
@@ -2225,6 +2249,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
@@ -2236,7 +2261,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-
+ (void)x_qh; (void)x_sc;
  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
  GGML_CUDA_ASSUME(k >= 0);
@@ -2245,7 +2270,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI4_0;
  const int kqsx = k % QI4_0;

- const block_q4_0 * bx0 = (block_q4_0 *) vx;
+ const block_q4_0 * bx0 = (const block_q4_0 *) vx;

  float * x_dmf = (float *) x_dm;

@@ -2283,9 +2308,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
- const float * x_dmf = (float *) x_dm;
+ const float * x_dmf = (const float *) x_dm;

  int u[2*VDR_Q4_0_Q8_1_MMQ];

@@ -2319,6 +2345,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
@@ -2330,6 +2357,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2339,7 +2367,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI4_1;
  const int kqsx = k % QI4_1;

- const block_q4_1 * bx0 = (block_q4_1 *) vx;
+ const block_q4_1 * bx0 = (const block_q4_1 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2374,6 +2402,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));

@@ -2411,6 +2440,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
@@ -2422,6 +2452,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2431,7 +2462,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI5_0;
  const int kqsx = k % QI5_0;

- const block_q5_0 * bx0 = (block_q5_0 *) vx;
+ const block_q5_0 * bx0 = (const block_q5_0 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2486,6 +2517,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
@@ -2525,6 +2557,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
@@ -2536,6 +2569,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2545,7 +2579,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI5_1;
  const int kqsx = k % QI5_1;

- const block_q5_1 * bx0 = (block_q5_1 *) vx;
+ const block_q5_1 * bx0 = (const block_q5_1 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2597,6 +2631,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
@@ -2631,6 +2666,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
@@ -2642,6 +2678,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2652,7 +2689,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kqsx = k % QI8_0;
  float * x_dmf = (float *) x_dm;

- const block_q8_0 * bx0 = (block_q8_0 *) vx;
+ const block_q8_0 * bx0 = (const block_q8_0 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2687,6 +2724,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const float * x_dmf = (const float *) x_dm;
  const float * y_df = (const float *) y_ds;
@@ -2720,6 +2758,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
@@ -2733,6 +2772,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2742,7 +2782,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI2_K;
  const int kqsx = k % QI2_K;

- const block_q2_K * bx0 = (block_q2_K *) vx;
+ const block_q2_K * bx0 = (const block_q2_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2790,6 +2830,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const int kbx = k / QI2_K;
  const int ky = (k % QI2_K) * QR2_K;
@@ -2863,7 +2904,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI3_K;
  const int kqsx = k % QI3_K;

- const block_q3_K * bx0 = (block_q3_K *) vx;
+ const block_q3_K * bx0 = (const block_q3_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2944,7 +2985,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
  const float * x_dmf = (const float *) x_dm;
  const float * y_df = (const float *) y_ds;

- const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+ const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;

  int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];

@@ -3059,6 +3100,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
@@ -3072,6 +3114,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3081,7 +3124,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI4_K; // == 0 if QK_K == 256
  const int kqsx = k % QI4_K; // == k if QK_K == 256

- const block_q4_K * bx0 = (block_q4_K *) vx;
+ const block_q4_K * bx0 = (const block_q4_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3126,7 +3169,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

  const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);

- const int * scales = (int *) bxi->scales;
+ const int * scales = (const int *) bxi->scales;

  const int ksc = k % (WARP_SIZE/8);

@@ -3141,6 +3184,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);

@@ -3240,6 +3284,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
@@ -3253,6 +3298,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3262,7 +3308,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI5_K; // == 0 if QK_K == 256
  const int kqsx = k % QI5_K; // == k if QK_K == 256

- const block_q5_K * bx0 = (block_q5_K *) vx;
+ const block_q5_K * bx0 = (const block_q5_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3318,7 +3364,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

  const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);

- const int * scales = (int *) bxi->scales;
+ const int * scales = (const int *) bxi->scales;

  const int ksc = k % (WARP_SIZE/8);

@@ -3333,6 +3379,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);

@@ -3369,6 +3416,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
@@ -3382,6 +3430,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3391,7 +3440,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI6_K; // == 0 if QK_K == 256
  const int kqsx = k % QI6_K; // == k if QK_K == 256

- const block_q6_K * bx0 = (block_q6_K *) vx;
+ const block_q6_K * bx0 = (const block_q6_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3453,6 +3502,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const float * x_dmf = (const float *) x_dm;
  const float * y_df = (const float *) y_ds;
@@ -3495,7 +3545,7 @@ static __device__ __forceinline__ void mul_mat_q(
  __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
  __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];

- float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
+ float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};

  for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {

@@ -4468,6 +4518,13 @@ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
  *dsti = __float2half(*xi);
  }

+ static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+ const half * xi = (const half *) cxi;
+ half * dsti = (half *) cdsti;
+
+ *dsti = *xi;
+ }
+
  template <cpy_kernel_t cpy_1>
  static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
  const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -4721,6 +4778,25 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
  dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
  }

+ static __global__ void im2col_f32_f16(
+ const float * x, half * dst,
+ int ofs0, int ofs1, int IW, int IH, int CHW,
+ int s0, int s1, int p0, int p1, int d0, int d1) {
+ const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
+ const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
+
+ const int offset_dst =
+ (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
+ (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z);
+
+ if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+ dst[offset_dst] = __float2half(0.0f);
+ } else {
+ const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1;
+ dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
+ }
+ }
+
  template<int qk, int qr, dequantize_kernel_t dq>
  static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
  const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
@@ -4759,6 +4835,16 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
  silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
  }

+ static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+ relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+ }
+
+ static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+ sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+ }
+
  static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % WARP_SIZE == 0);
  if (ncols < 1024) {
@@ -5611,6 +5697,16 @@ static void ggml_cpy_f32_f16_cuda(
  (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
  }

+ static void ggml_cpy_f16_f16_cuda(
+ const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+ cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+ }
+
  static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
@@ -5694,6 +5790,15 @@ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, c
  soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
  }

+ static void im2col_f32_f16_cuda(const float * x, half * dst,
+ int OH, int IW, int IH, int OW, int IC,
+ int KH, int KW, int N, int ofs0, int ofs1,
+ int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
+ dim3 block_nums(IC, OH, OW);
+ dim3 block_dims(N, KH, KW);
+ im2col_f32_f16<<<block_nums, block_dims, 0, stream>>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+ }
+
  // buffer pool for cuda
  #define MAX_CUDA_BUFFERS 256

@@ -5762,7 +5867,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
  return ptr;
  }
  #ifdef DEBUG_CUDA_MALLOC
- fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+ fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
  (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
  #endif
  void * ptr;
@@ -5900,7 +6005,7 @@ void * ggml_cuda_host_malloc(size_t size) {
  // The allocation error can be bypassed. A null ptr will assigned out of this function.
  // This can fixed the OOM error in WSL.
  cudaGetLastError();
- fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+ fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
  size/1024.0/1024.0, cudaGetErrorString(err));
  return nullptr;
  }
@@ -5945,18 +6050,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
  const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
  if (nb0 == ts && nb1 == ts*ne0/bs) {
  return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
- } else if (nb0 == ts) {
+ }
+ if (nb0 == ts) {
  return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
- } else {
- for (int64_t i1 = 0; i1 < i1_diff; i1++) {
- const void * rx = (const void *) ((const char *) x + i1*nb1);
- void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
- // pretend the row is a matrix with cols=1
- cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
- if (r != cudaSuccess) return r;
- }
- return cudaSuccess;
  }
+ for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+ const void * rx = (const void *) ((const char *) x + i1*nb1);
+ void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+ // pretend the row is a matrix with cols=1
+ cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+ if (r != cudaSuccess) { return r; }
+ }
+ return cudaSuccess;
  }

  static void ggml_cuda_op_repeat(
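The restructured ggml_cuda_cpy_tensor_2d above keeps three copy strategies: a single cudaMemcpyAsync when the slice is fully contiguous, one cudaMemcpy2DAsync when only rows are contiguous, and a per-row cudaMemcpy2DAsync that, as the comment puts it, pretends the row is a matrix with cols=1 when elements within a row are strided. A minimal sketch of that last trick, for illustration only (plain element sizes and strides instead of ggml's ts/bs/nb bookkeeping; the helper name copy_strided_row is not from the diff):

```cuda
#include <cuda_runtime.h>

// Gather ne0 elements that sit src_stride bytes apart into a tightly packed
// destination. Each element is treated as one 1-column "row" of a 2-D copy:
// width = elem_size, height = ne0, dpitch = elem_size, spitch = src_stride.
static cudaError_t copy_strided_row(void * dst, const void * src,
                                    size_t elem_size, size_t src_stride, size_t ne0,
                                    cudaMemcpyKind kind, cudaStream_t stream) {
    return cudaMemcpy2DAsync(dst, elem_size, src, src_stride,
                             elem_size, ne0, kind, stream);
}
```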
@@ -6128,6 +6233,34 @@ inline void ggml_cuda_op_silu(
  (void) src1_dd;
  }

+ inline void ggml_cuda_op_relu(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+ relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+ (void) src1;
+ (void) dst;
+ (void) src1_dd;
+ }
+
+ inline void ggml_cuda_op_sqr(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+ sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+ (void) src1;
+ (void) dst;
+ (void) src1_dd;
+ }
+
  inline void ggml_cuda_op_norm(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6250,6 +6383,7 @@ static int64_t get_row_rounding(ggml_type type) {
  case GGML_TYPE_Q8_0:
  return max_compute_capability >= CC_RDNA2 ? 128 : 64;
  case GGML_TYPE_F16:
+ case GGML_TYPE_F32:
  return 1;
  case GGML_TYPE_Q2_K:
  return max_compute_capability >= CC_RDNA2 ? 128 : 32;
@@ -6272,6 +6406,7 @@ static int64_t get_row_rounding(ggml_type type) {
  case GGML_TYPE_Q8_0:
  return 64;
  case GGML_TYPE_F16:
+ case GGML_TYPE_F32:
  return 1;
  case GGML_TYPE_Q2_K:
  case GGML_TYPE_Q3_K:
@@ -6463,8 +6598,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
  src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
  to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
  }
- const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
-
+ const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
  size_t dst_as = 0;
  half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);

@@ -6639,6 +6773,45 @@ inline void ggml_cuda_op_alibi(
  (void) src1_dd;
  }

+ inline void ggml_cuda_op_im2col(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+ const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+ const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+ const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+ const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+ const int64_t N = src1->ne[is_2D ? 3 : 2];
+ const int64_t IC = src1->ne[is_2D ? 2 : 1];
+ const int64_t IH = is_2D ? src1->ne[1] : 1;
+ const int64_t IW = src1->ne[0];
+
+ const int64_t KH = is_2D ? src0->ne[1] : 1;
+ const int64_t KW = src0->ne[0];
+
+ const int64_t OH = is_2D ? dst->ne[2] : 1;
+ const int64_t OW = dst->ne[1];
+
+ const size_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
+ const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+
+ im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
+ OH, IW, IH, OW, IC, KH, KW, N,
+ ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
+
+ (void) src0;
+ (void) src0_dd;
+ }
+
  inline void ggml_cuda_op_diag_mask_inf(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
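ggml_cuda_op_im2col above unpacks stride (s0, s1), padding (p0, p1) and dilation (d0, d1) from dst->op_params and hands the launch to im2col_f32_f16_cuda, whose grid is (IC, OH, OW) and block is (N, KH, KW); each thread therefore writes one element of an [N*OH*OW, IC*KH*KW] matrix. A CPU reference sketch of the same indexing, for illustration only (f32 output instead of f16, and a contiguous [N][IC][IH][IW] source is assumed, i.e. ofs0 = IC*IH*IW and ofs1 = IH*IW):

```cuda
// Minimal CPU sketch mirroring the indexing of the im2col_f32_f16 kernel above.
// dst is laid out as [N*OH*OW][IC*KH*KW]; src as a contiguous [N][IC][IH][IW] f32 tensor.
static void im2col_cpu_ref(const float * src, float * dst,
                           int N, int IC, int IH, int IW,
                           int OH, int OW, int KH, int KW,
                           int s0, int s1, int p0, int p1, int d0, int d1) {
    const int CHW = IC * KH * KW;
    for (int n = 0; n < N; n++)
    for (int ic = 0; ic < IC; ic++)
    for (int oh = 0; oh < OH; oh++)
    for (int ow = 0; ow < OW; ow++)
    for (int kh = 0; kh < KH; kh++)
    for (int kw = 0; kw < KW; kw++) {
        const int iih = oh * s1 + kh * d1 - p1; // same formula as the kernel
        const int iiw = ow * s0 + kw * d0 - p0;
        const int dst_idx = (n * OH * OW + oh * OW + ow) * CHW
                          + (ic * KH * KW + kh * KW + kw);
        dst[dst_idx] = (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)
            ? 0.0f
            : src[((n * IC + ic) * IH + iih) * IW + iiw];
    }
}
```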
@@ -6843,7 +7016,7 @@ static void ggml_cuda_op_mul_mat(
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];
  const int64_t ne03 = src0->ne[3];
- const int64_t nrows0 = ggml_nrows(src0);
+ // const int64_t nrows0 = ggml_nrows(src0);

  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
@@ -6944,7 +7117,7 @@ static void ggml_cuda_op_mul_mat(
  if (src0_on_device && src0_is_contiguous) {
  src0_dd[id] = (char *) src0_extra->data_device[id];
  } else {
- const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+ // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
  src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
  }

@@ -7160,6 +7333,14 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
  }

+ static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
+ }
+
+ static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
+ }
+
  static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
  }
@@ -7169,7 +7350,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
  }

  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
- if (!g_cublas_loaded) return false;
+ if (!g_cublas_loaded) { return false; }

  const int64_t ne10 = src1->ne[0];

@@ -7247,7 +7428,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
  }

- __global__ void k_compute_batched_ptrs(
+ __global__ static void k_compute_batched_ptrs(
  const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
  const void ** ptrs_src, void ** ptrs_dst,
  int ne12, int ne13,
@@ -7543,6 +7724,9 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
  ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
  ne10, ne11, nb10, nb11, nb12, main_stream);
+ } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+ ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
+ ne10, ne11, nb10, nb11, nb12, main_stream);
  } else {
  fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
  ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -7574,6 +7758,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
  }

+ static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
+ }
+
  static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  (void) src0;
  (void) src1;
@@ -7685,11 +7873,11 @@ static size_t g_temp_tensor_extra_index = 0;

  static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
  if (g_temp_tensor_extras == nullptr) {
- g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+ g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
  }

  size_t alloc_index = g_temp_tensor_extra_index;
- g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+ g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
  ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
  memset(extra, 0, sizeof(*extra));

@@ -7856,7 +8044,7 @@ void ggml_cuda_free_scratch() {
  }

  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
- if (!g_cublas_loaded) return false;
+ if (!g_cublas_loaded) { return false; }

  ggml_cuda_func_t func;
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -7867,6 +8055,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  return false;
  }

+ if (tensor->op == GGML_OP_MUL_MAT) {
+ if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+ #endif
+ return false;
+ }
+ }
+
  switch (tensor->op) {
  case GGML_OP_REPEAT:
  func = ggml_cuda_repeat;
@@ -7891,6 +8088,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  case GGML_UNARY_OP_SILU:
  func = ggml_cuda_silu;
  break;
+ case GGML_UNARY_OP_RELU:
+ func = ggml_cuda_relu;
+ break;
  default:
  return false;
  } break;
@@ -7909,6 +8109,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  case GGML_OP_SCALE:
  func = ggml_cuda_scale;
  break;
+ case GGML_OP_SQR:
+ func = ggml_cuda_sqr;
+ break;
  case GGML_OP_CLAMP:
  if (!any_on_device) {
  return false;
@@ -7939,6 +8142,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  case GGML_OP_ALIBI:
  func = ggml_cuda_alibi;
  break;
+ case GGML_OP_IM2COL:
+ func = ggml_cuda_im2col;
+ break;
  default:
  return false;
  }
@@ -7998,11 +8204,11 @@ struct ggml_backend_buffer_context_cuda {

  ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
  if (temp_tensor_extras == nullptr) {
- temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+ temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
  }

  size_t alloc_index = temp_tensor_extra_index;
- temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+ temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
  ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
  memset(extra, 0, sizeof(*extra));

@@ -8088,7 +8294,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backe
  ggml_cuda_set_device(g_main_device);

  ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+
+ size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
+
+ ggml_cuda_set_device(g_main_device);
  CUDA_CHECK(cudaMalloc(&ctx->device, size));
+
  return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
  }

@@ -8132,14 +8343,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
  UNUSED(cgraph);
  }

- static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ [[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  GGML_ASSERT(!"not implemented");

  UNUSED(backend);
  UNUSED(plan);
  }

- static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ [[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  GGML_ASSERT(!"not implemented");

  UNUSED(backend);
@@ -8155,6 +8366,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
  for (int i = 0; i < cgraph->n_nodes; i++) {
  ggml_tensor * node = cgraph->nodes[i];

+ if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
+ continue;
+ }
  assert(node->backend == GGML_BACKEND_GPU);
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  if (node->src[j] != nullptr) {