llama_cpp 0.9.2 → 0.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
  #include <algorithm>
+ #include <cinttypes>
  #include <cstddef>
  #include <cstdint>
  #include <limits>
@@ -81,12 +82,15 @@

  #include "ggml-cuda.h"
  #include "ggml.h"
+ #include "ggml-backend-impl.h"

  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
  #define CC_VOLTA 700
  #define CC_OFFSET_AMD 1000000
  #define CC_RDNA2 (CC_OFFSET_AMD + 1030)

+ #define GGML_CUDA_MAX_NODES 8192
+
  // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
  // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
  // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
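Note: the new GGML_CUDA_MAX_NODES constant is used further down to size the pools of temporary ggml_tensor_extra_gpu slots (the ggml_cuda_alloc_temp_tensor_extra hunks), replacing the earlier dependence on GGML_MAX_NODES. A minimal restatement of that round-robin pool pattern, with illustrative names only (Extra, pool, alloc_temp_extra are not package identifiers):

#include <cstring>

struct Extra { void * data_device[16]; };   // stand-in for ggml_tensor_extra_gpu

static Extra * pool      = nullptr;
static size_t  pool_next = 0;

static Extra * alloc_temp_extra() {
    if (pool == nullptr) {
        pool = new Extra[8192];             // GGML_CUDA_MAX_NODES entries, allocated once
    }
    Extra * e = &pool[pool_next];
    pool_next = (pool_next + 1) % 8192;     // wrap around instead of growing
    std::memset(e, 0, sizeof(*e));
    return e;
}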
@@ -232,7 +236,7 @@ typedef float2 dfloat2;
  #endif //GGML_CUDA_F16

  static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
- const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+ const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment

  int x32 = 0;
  x32 |= x16[0] << 0;
@@ -242,7 +246,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const
  }

  static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
- const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+ const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment

  int x32 = 0;
  x32 |= x16[0] << 0;
@@ -252,11 +256,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con
  }

  static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
- return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+ return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  }

  static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
- return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+ return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  }

  template<typename T>
@@ -433,6 +437,8 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_MUL_BLOCK_SIZE 256
  #define CUDA_GELU_BLOCK_SIZE 256
  #define CUDA_SILU_BLOCK_SIZE 256
+ #define CUDA_RELU_BLOCK_SIZE 256
+ #define CUDA_SQR_BLOCK_SIZE 256
  #define CUDA_CPY_BLOCK_SIZE 32
  #define CUDA_SCALE_BLOCK_SIZE 256
  #define CUDA_CLAMP_BLOCK_SIZE 256
@@ -464,7 +470,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
  #define MUL_MAT_SRC1_COL_STRIDE 128

  #define MAX_STREAMS 8
- static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+ static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };

  struct ggml_tensor_extra_gpu {
  void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
@@ -553,6 +559,24 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
  dst[i] = x[i] / (1.0f + expf(-x[i]));
  }

+ static __global__ void relu_f32(const float * x, float * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = fmaxf(x[i], 0);
+ }
+
+ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = x[i] * x[i];
+ }
+
  static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
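The new relu_f32 and sqr_f32 kernels follow the same flat elementwise pattern as the existing silu_f32: one thread per element, with an early return so the last (partially filled) block stays in bounds. The package wires them up through relu_f32_cuda / sqr_f32_cuda further down; the following is only a self-contained sketch of how such a kernel is launched and smoke-tested on the host (demo names, not package code):

// Standalone illustration of the elementwise launch pattern used above.
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

__global__ void relu_f32_demo(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) { return; }
    dst[i] = fmaxf(x[i], 0);
}

int main() {
    const int k = 1000;
    const int block = 256;                       // CUDA_RELU_BLOCK_SIZE
    const int grid  = (k + block - 1) / block;   // ceil(k / block)

    std::vector<float> h(k);
    for (int i = 0; i < k; ++i) h[i] = (i % 2 ? 1.0f : -1.0f) * i;  // alternate signs

    float *dx, *dy;
    cudaMalloc(&dx, k*sizeof(float));
    cudaMalloc(&dy, k*sizeof(float));
    cudaMemcpy(dx, h.data(), k*sizeof(float), cudaMemcpyHostToDevice);

    relu_f32_demo<<<grid, block>>>(dx, dy, k);   // the bounds check handles the ragged tail
    cudaMemcpy(h.data(), dy, k*sizeof(float), cudaMemcpyDeviceToHost);

    printf("dst[2] = %f (expected 0, input was -2)\n", h[2]);
    cudaFree(dx); cudaFree(dy);
    return 0;
}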
@@ -2225,6 +2249,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
@@ -2236,7 +2261,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-
+ (void)x_qh; (void)x_sc;
  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
  GGML_CUDA_ASSUME(k >= 0);
@@ -2245,7 +2270,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI4_0;
  const int kqsx = k % QI4_0;

- const block_q4_0 * bx0 = (block_q4_0 *) vx;
+ const block_q4_0 * bx0 = (const block_q4_0 *) vx;

  float * x_dmf = (float *) x_dm;

@@ -2283,9 +2308,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
- const float * x_dmf = (float *) x_dm;
+ const float * x_dmf = (const float *) x_dm;

  int u[2*VDR_Q4_0_Q8_1_MMQ];

@@ -2319,6 +2345,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
@@ -2330,6 +2357,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2339,7 +2367,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI4_1;
  const int kqsx = k % QI4_1;

- const block_q4_1 * bx0 = (block_q4_1 *) vx;
+ const block_q4_1 * bx0 = (const block_q4_1 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2374,6 +2402,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));

@@ -2411,6 +2440,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
@@ -2422,6 +2452,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2431,7 +2462,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI5_0;
  const int kqsx = k % QI5_0;

- const block_q5_0 * bx0 = (block_q5_0 *) vx;
+ const block_q5_0 * bx0 = (const block_q5_0 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2486,6 +2517,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
@@ -2525,6 +2557,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
@@ -2536,6 +2569,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2545,7 +2579,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI5_1;
  const int kqsx = k % QI5_1;

- const block_q5_1 * bx0 = (block_q5_1 *) vx;
+ const block_q5_1 * bx0 = (const block_q5_1 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2597,6 +2631,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
@@ -2631,6 +2666,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh; (void)x_sc;

  __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
@@ -2642,6 +2678,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh; (void)x_sc;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2652,7 +2689,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kqsx = k % QI8_0;
  float * x_dmf = (float *) x_dm;

- const block_q8_0 * bx0 = (block_q8_0 *) vx;
+ const block_q8_0 * bx0 = (const block_q8_0 *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2687,6 +2724,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh; (void)x_sc;

  const float * x_dmf = (const float *) x_dm;
  const float * y_df = (const float *) y_ds;
@@ -2720,6 +2758,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
@@ -2733,6 +2772,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2742,7 +2782,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI2_K;
  const int kqsx = k % QI2_K;

- const block_q2_K * bx0 = (block_q2_K *) vx;
+ const block_q2_K * bx0 = (const block_q2_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2790,6 +2830,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const int kbx = k / QI2_K;
  const int ky = (k % QI2_K) * QR2_K;
@@ -2863,7 +2904,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI3_K;
  const int kqsx = k % QI3_K;

- const block_q3_K * bx0 = (block_q3_K *) vx;
+ const block_q3_K * bx0 = (const block_q3_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2944,7 +2985,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
  const float * x_dmf = (const float *) x_dm;
  const float * y_df = (const float *) y_ds;

- const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+ const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;

  int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];

@@ -3059,6 +3100,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
@@ -3072,6 +3114,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3081,7 +3124,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI4_K; // == 0 if QK_K == 256
  const int kqsx = k % QI4_K; // == k if QK_K == 256

- const block_q4_K * bx0 = (block_q4_K *) vx;
+ const block_q4_K * bx0 = (const block_q4_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3126,7 +3169,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

  const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);

- const int * scales = (int *) bxi->scales;
+ const int * scales = (const int *) bxi->scales;

  const int ksc = k % (WARP_SIZE/8);

@@ -3141,6 +3184,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);

@@ -3240,6 +3284,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
@@ -3253,6 +3298,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3262,7 +3308,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI5_K; // == 0 if QK_K == 256
  const int kqsx = k % QI5_K; // == k if QK_K == 256

- const block_q5_K * bx0 = (block_q5_K *) vx;
+ const block_q5_K * bx0 = (const block_q5_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3318,7 +3364,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

  const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);

- const int * scales = (int *) bxi->scales;
+ const int * scales = (const int *) bxi->scales;

  const int ksc = k % (WARP_SIZE/8);

@@ -3333,6 +3379,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);

@@ -3369,6 +3416,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ (void)x_qh;

  __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
@@ -3382,6 +3430,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(
  template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ (void)x_qh;

  GGML_CUDA_ASSUME(i_offset >= 0);
  GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3391,7 +3440,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const int kbx = k / QI6_K; // == 0 if QK_K == 256
  const int kqsx = k % QI6_K; // == k if QK_K == 256

- const block_q6_K * bx0 = (block_q6_K *) vx;
+ const block_q6_K * bx0 = (const block_q6_K *) vx;

  #pragma unroll
  for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3453,6 +3502,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ (void)x_qh;

  const float * x_dmf = (const float *) x_dm;
  const float * y_df = (const float *) y_ds;
@@ -3495,7 +3545,7 @@ static __device__ __forceinline__ void mul_mat_q(
  __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
  __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];

- float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
+ float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};

  for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {

@@ -4468,6 +4518,13 @@ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
  *dsti = __float2half(*xi);
  }

+ static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+ const half * xi = (const half *) cxi;
+ half * dsti = (half *) cdsti;
+
+ *dsti = *xi;
+ }
+
  template <cpy_kernel_t cpy_1>
  static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
  const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -4721,6 +4778,25 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
  dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
  }

+ static __global__ void im2col_f32_f16(
+ const float * x, half * dst,
+ int ofs0, int ofs1, int IW, int IH, int CHW,
+ int s0, int s1, int p0, int p1, int d0, int d1) {
+ const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
+ const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
+
+ const int offset_dst =
+ (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
+ (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z);
+
+ if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+ dst[offset_dst] = __float2half(0.0f);
+ } else {
+ const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1;
+ dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
+ }
+ }
+
  template<int qk, int qr, dequantize_kernel_t dq>
  static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
  const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
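The new im2col_f32_f16 kernel is launched (see im2col_f32_f16_cuda further down) with gridDim = (IC, OH, OW) and blockDim = (N, KH, KW), so the built-in indices decode as: blockIdx.x = input channel, blockIdx.y/z = output row/column, threadIdx.x = batch index, threadIdx.y/z = kernel row/column, and dst is laid out as N*OH*OW rows of CHW = IC*KH*KW halves. The following is only a commented restatement of the same index math, for orientation (the helper name is made up, not package code):

// Illustrative restatement of im2col_f32_f16's destination indexing.
// dst is logically [N*OH*OW, IC*KH*KW], stored row-major; CHW == IC*KH*KW.
__device__ inline int im2col_dst_index(
        int n, int oh, int ow,            // threadIdx.x, blockIdx.y, blockIdx.z
        int ic, int kh, int kw,           // blockIdx.x, threadIdx.y, threadIdx.z
        int OH, int OW, int KH, int KW, int CHW) {
    const int row = n * OH * OW + oh * OW + ow;    // which output pixel
    const int col = ic * KH * KW + kh * KW + kw;   // which input tap within that pixel's row
    return row * CHW + col;                        // matches offset_dst in the kernel above
}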
@@ -4759,6 +4835,16 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
  silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
  }

+ static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+ relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+ }
+
+ static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+ sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+ }
+
  static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % WARP_SIZE == 0);
  if (ncols < 1024) {
@@ -5611,6 +5697,16 @@ static void ggml_cpy_f32_f16_cuda(
  (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
  }

+ static void ggml_cpy_f16_f16_cuda(
+ const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+ cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+ }
+
  static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
@@ -5694,6 +5790,15 @@ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, c
  soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
  }

+ static void im2col_f32_f16_cuda(const float * x, half * dst,
+ int OH, int IW, int IH, int OW, int IC,
+ int KH, int KW, int N, int ofs0, int ofs1,
+ int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
+ dim3 block_nums(IC, OH, OW);
+ dim3 block_dims(N, KH, KW);
+ im2col_f32_f16<<<block_nums, block_dims, 0, stream>>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+ }
+
  // buffer pool for cuda
  #define MAX_CUDA_BUFFERS 256

@@ -5762,7 +5867,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
  return ptr;
  }
  #ifdef DEBUG_CUDA_MALLOC
- fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+ fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
  (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
  #endif
  void * ptr;
@@ -5900,7 +6005,7 @@ void * ggml_cuda_host_malloc(size_t size) {
  // The allocation error can be bypassed. A null ptr will assigned out of this function.
  // This can fixed the OOM error in WSL.
  cudaGetLastError();
- fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+ fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
  size/1024.0/1024.0, cudaGetErrorString(err));
  return nullptr;
  }
@@ -5945,18 +6050,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
  const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
  if (nb0 == ts && nb1 == ts*ne0/bs) {
  return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
- } else if (nb0 == ts) {
+ }
+ if (nb0 == ts) {
  return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
- } else {
- for (int64_t i1 = 0; i1 < i1_diff; i1++) {
- const void * rx = (const void *) ((const char *) x + i1*nb1);
- void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
- // pretend the row is a matrix with cols=1
- cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
- if (r != cudaSuccess) return r;
- }
- return cudaSuccess;
  }
+ for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+ const void * rx = (const void *) ((const char *) x + i1*nb1);
+ void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+ // pretend the row is a matrix with cols=1
+ cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+ if (r != cudaSuccess) { return r; }
+ }
+ return cudaSuccess;
  }

  static void ggml_cuda_op_repeat(
@@ -6128,6 +6233,34 @@ inline void ggml_cuda_op_silu(
  (void) src1_dd;
  }

+ inline void ggml_cuda_op_relu(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+ relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+ (void) src1;
+ (void) dst;
+ (void) src1_dd;
+ }
+
+ inline void ggml_cuda_op_sqr(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+ sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+ (void) src1;
+ (void) dst;
+ (void) src1_dd;
+ }
+
  inline void ggml_cuda_op_norm(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6250,6 +6383,7 @@ static int64_t get_row_rounding(ggml_type type) {
  case GGML_TYPE_Q8_0:
  return max_compute_capability >= CC_RDNA2 ? 128 : 64;
  case GGML_TYPE_F16:
+ case GGML_TYPE_F32:
  return 1;
  case GGML_TYPE_Q2_K:
  return max_compute_capability >= CC_RDNA2 ? 128 : 32;
@@ -6272,6 +6406,7 @@ static int64_t get_row_rounding(ggml_type type) {
  case GGML_TYPE_Q8_0:
  return 64;
  case GGML_TYPE_F16:
+ case GGML_TYPE_F32:
  return 1;
  case GGML_TYPE_Q2_K:
  case GGML_TYPE_Q3_K:
@@ -6463,8 +6598,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
  src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
  to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
  }
- const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
-
+ const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
  size_t dst_as = 0;
  half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);

@@ -6639,6 +6773,45 @@ inline void ggml_cuda_op_alibi(
  (void) src1_dd;
  }

+ inline void ggml_cuda_op_im2col(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+ const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+ const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+ const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+ const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+ const int64_t N = src1->ne[is_2D ? 3 : 2];
+ const int64_t IC = src1->ne[is_2D ? 2 : 1];
+ const int64_t IH = is_2D ? src1->ne[1] : 1;
+ const int64_t IW = src1->ne[0];
+
+ const int64_t KH = is_2D ? src0->ne[1] : 1;
+ const int64_t KW = src0->ne[0];
+
+ const int64_t OH = is_2D ? dst->ne[2] : 1;
+ const int64_t OW = dst->ne[1];
+
+ const size_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
+ const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+
+ im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
+ OH, IW, IH, OW, IC, KH, KW, N,
+ ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
+
+ (void) src0;
+ (void) src0_dd;
+ }
+
  inline void ggml_cuda_op_diag_mask_inf(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
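A worked sizing example for the im2col path above, with values chosen purely for illustration: a 2D case with N = 1, IC = 3, IH = IW = 64, a KH x KW = 3 x 3 kernel, stride s0 = s1 = 1, padding p0 = p1 = 0, dilation d0 = d1 = 1 gives OW = (IW + 2*p0 - d0*(KW - 1) - 1)/s0 + 1 = 62 and likewise OH = 62. The launch in im2col_f32_f16_cuda is then block_nums = (IC, OH, OW) = (3, 62, 62) and block_dims = (N, KH, KW) = (1, 3, 3), i.e. 9 threads per block, and dst receives N*OH*OW = 3844 rows of IC*KH*KW = 27 half values each. Note that blockDim here is (N, KH, KW), so the product N*KH*KW must stay within CUDA's 1024-threads-per-block limit for this mapping to be launchable.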
@@ -6843,7 +7016,7 @@ static void ggml_cuda_op_mul_mat(
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];
  const int64_t ne03 = src0->ne[3];
- const int64_t nrows0 = ggml_nrows(src0);
+ // const int64_t nrows0 = ggml_nrows(src0);

  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
@@ -6944,7 +7117,7 @@ static void ggml_cuda_op_mul_mat(
  if (src0_on_device && src0_is_contiguous) {
  src0_dd[id] = (char *) src0_extra->data_device[id];
  } else {
- const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+ // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
  src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
  }

@@ -7160,6 +7333,14 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
  }

+ static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
+ }
+
+ static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
+ }
+
  static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
  }
@@ -7169,7 +7350,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
  }

  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
- if (!g_cublas_loaded) return false;
+ if (!g_cublas_loaded) { return false; }

  const int64_t ne10 = src1->ne[0];

@@ -7247,7 +7428,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
  }

- __global__ void k_compute_batched_ptrs(
+ __global__ static void k_compute_batched_ptrs(
  const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
  const void ** ptrs_src, void ** ptrs_dst,
  int ne12, int ne13,
@@ -7543,6 +7724,9 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
  ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
  ne10, ne11, nb10, nb11, nb12, main_stream);
+ } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+ ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
+ ne10, ne11, nb10, nb11, nb12, main_stream);
  } else {
  fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
  ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -7574,6 +7758,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
  }

+ static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
+ }
+
  static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  (void) src0;
  (void) src1;
@@ -7685,11 +7873,11 @@ static size_t g_temp_tensor_extra_index = 0;

  static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
  if (g_temp_tensor_extras == nullptr) {
- g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+ g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
  }

  size_t alloc_index = g_temp_tensor_extra_index;
- g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+ g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
  ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
  memset(extra, 0, sizeof(*extra));

@@ -7856,7 +8044,7 @@ void ggml_cuda_free_scratch() {
  }

  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
- if (!g_cublas_loaded) return false;
+ if (!g_cublas_loaded) { return false; }

  ggml_cuda_func_t func;
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -7867,6 +8055,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  return false;
  }

+ if (tensor->op == GGML_OP_MUL_MAT) {
+ if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+ #endif
+ return false;
+ }
+ }
+
  switch (tensor->op) {
  case GGML_OP_REPEAT:
  func = ggml_cuda_repeat;
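This new debug message is also why <cinttypes> is now included at the top of the file: PRId64 supplies the portable printf conversion specifier for int64_t. A minimal standalone illustration (not package code):

#include <cinttypes>
#include <cstdio>

int main() {
    const int64_t ne3 = 12;
    // PRId64 expands to the right length/conversion suffix for int64_t ("lld", "ld", ...),
    // so the format string stays correct across 32- and 64-bit targets.
    printf("src0->ne[3] = %" PRId64 "\n", ne3);
    return 0;
}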
@@ -7891,6 +8088,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  case GGML_UNARY_OP_SILU:
  func = ggml_cuda_silu;
  break;
+ case GGML_UNARY_OP_RELU:
+ func = ggml_cuda_relu;
+ break;
  default:
  return false;
  } break;
@@ -7909,6 +8109,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  case GGML_OP_SCALE:
  func = ggml_cuda_scale;
  break;
+ case GGML_OP_SQR:
+ func = ggml_cuda_sqr;
+ break;
  case GGML_OP_CLAMP:
  if (!any_on_device) {
  return false;
@@ -7939,6 +8142,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  case GGML_OP_ALIBI:
  func = ggml_cuda_alibi;
  break;
+ case GGML_OP_IM2COL:
+ func = ggml_cuda_im2col;
+ break;
  default:
  return false;
  }
@@ -7998,11 +8204,11 @@ struct ggml_backend_buffer_context_cuda {

  ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
  if (temp_tensor_extras == nullptr) {
- temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+ temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
  }

  size_t alloc_index = temp_tensor_extra_index;
- temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+ temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
  ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
  memset(extra, 0, sizeof(*extra));

@@ -8088,7 +8294,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backe
  ggml_cuda_set_device(g_main_device);

  ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+
+ size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
+
+ ggml_cuda_set_device(g_main_device);
  CUDA_CHECK(cudaMalloc(&ctx->device, size));
+
  return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
  }
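The std::max clamp added above guards the zero-size corner case called out in the diff's own comment: asking cudaMalloc for 0 bytes can hand back a null pointer, which later code treating the buffer as valid would trip over. A small, self-contained sketch of the same guard in isolation (the helper name is illustrative, not package code):

// Sketch of the zero-size allocation guard, for illustration only.
#include <cuda_runtime.h>
#include <algorithm>
#include <cstdio>

static void * device_alloc(size_t size) {
    size = std::max(size, (size_t)1); // never request 0 bytes
    void * ptr = nullptr;
    const cudaError_t err = cudaMalloc(&ptr, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return nullptr;
    }
    return ptr; // non-null on success, even for a nominally empty buffer
}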

@@ -8132,14 +8343,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
  UNUSED(cgraph);
  }

- static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ [[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  GGML_ASSERT(!"not implemented");

  UNUSED(backend);
  UNUSED(plan);
  }

- static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ [[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  GGML_ASSERT(!"not implemented");

  UNUSED(backend);
@@ -8155,6 +8366,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
  for (int i = 0; i < cgraph->n_nodes; i++) {
  ggml_tensor * node = cgraph->nodes[i];

+ if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
+ continue;
+ }
  assert(node->backend == GGML_BACKEND_GPU);
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  if (node->src[j] != nullptr) {