llama_cpp 0.7.0 → 0.7.1

@@ -62,6 +62,7 @@
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -414,11 +415,13 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
+#define CUDA_CLAMP_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
 #define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+#define CUDA_GET_ROWS_BLOCK_SIZE 256
 
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
@@ -1574,6 +1577,34 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
     reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }
 
+template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
+    const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int r = y[row];
+
+    // copy x[r*ncols + col] to dst[row*ncols + col]
+    const int xi = r*ncols + col;
+    const int di = row*ncols + col;
+
+    const int ib = xi/qk; // block index
+    const int iqs = (xi%qk)/qr; // quant index
+    const int iybs = di - di%qk; // y block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(x, ib, iqs, v);
+
+    dst[iybs + iqs + 0] = v.x;
+    dst[iybs + iqs + y_offset] = v.y;
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
@@ -4555,6 +4586,24 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
     dst[i] = scale * x[i];
 }
 
+static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
+}
+
+template<int qk, int qr, dequantize_kernel_t dq>
+static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const dim3 block_nums(block_num_x, nrows, 1);
+    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
+}
+
 static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -5436,6 +5485,11 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }
 
+static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
+    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
+}
+
 template<typename T>
 static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
     const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
@@ -5703,7 +5757,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
         GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = cudaMemcpyDeviceToDevice;
-        struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
+        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
         CUDA_CHECK(cudaGetDevice(&id));
         src_ptr = (char *) extra->data_device[id];
@@ -5739,6 +5793,107 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     }
 }
 
+static void ggml_cuda_op_repeat(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+    // guaranteed to be an integer due to the check in ggml_can_repeat
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+    const size_t nb02 = src0->nb[2];
+    const size_t nb03 = src0->nb[3];
+
+    const int nr0 = (int)(ne0/ne00);
+    const int nr1 = (int)(ne1/ne01);
+    const int nr2 = (int)(ne2/ne02);
+    const int nr3 = (int)(ne3/ne03);
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
+    for (int i3 = 0; i3 < nr3; i3++) {
+        for (int k3 = 0; k3 < ne03; k3++) {
+            for (int i2 = 0; i2 < nr2; i2++) {
+                for (int k2 = 0; k2 < ne02; k2++) {
+                    for (int i1 = 0; i1 < nr1; i1++) {
+                        for (int k1 = 0; k1 < ne01; k1++) {
+                            for (int i0 = 0; i0 < nr0; i0++) {
+                                CUDA_CHECK(cudaMemcpyAsync(
+                                    (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
+                                    (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
+                                    ne00*nb0, cudaMemcpyDeviceToDevice, stream));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    (void) src1;
+    (void) src1_d;
+}
+
+static void ggml_cuda_op_get_rows(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    const int ncols = src0->ne[0];
+    const int nrows = ggml_nelements(src1);
+
+    const int32_t * src1_i32 = (const int32_t *) src1_d;
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_F32:
+            get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q4_0:
+            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        default:
+            // TODO: k-quants
+            GGML_ASSERT(false);
+            break;
+    }
+}
+
 inline void ggml_cuda_op_add(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6279,12 +6434,12 @@ inline void ggml_cuda_op_alibi(
     const int64_t ne02 = src0->ne[2];
     const int64_t nrows = ggml_nrows(src0);
 
-    const int n_past = ((int32_t *) dst->op_params)[0];
+    //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
 
-    GGML_ASSERT(ne01 + n_past == ne00);
+    //GGML_ASSERT(ne01 + n_past == ne00);
     GGML_ASSERT(n_head == ne02);
 
     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -6343,7 +6498,14 @@ inline void ggml_cuda_op_scale(
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    const float scale = ((float *) src1->data)[0];
+    float scale;
+    // HACK: support for ggml backend interface
+    if (src1->backend == GGML_BACKEND_CPU) {
+        scale = ((float *) src1->data)[0];
+    } else {
+        // TODO: pass pointer to kernel instead of copying to host
+        CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
+    }
 
     scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
@@ -6353,6 +6515,24 @@ inline void ggml_cuda_op_scale(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_clamp(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const float min = ((float *) dst->op_params)[0];
+    const float max = ((float *) dst->op_params)[1];
+
+    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
+    CUDA_CHECK(cudaGetLastError());
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
     const int64_t nrows0 = ggml_nrows(src0);
 
@@ -6362,9 +6542,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
 
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 
     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
@@ -6505,9 +6685,9 @@ static void ggml_cuda_op_mul_mat(
     const size_t q8_1_ts = sizeof(block_q8_1);
     const size_t q8_1_bs = QK8_1;
 
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 
     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
@@ -6585,7 +6765,7 @@ static void ggml_cuda_op_mul_mat(
         if (convert_src1_to_q8_1) {
             src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
 
-            if (split && src1_on_device && src1_is_contiguous) {
+            if (src1_on_device && src1_is_contiguous) {
                 quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
@@ -6667,7 +6847,7 @@ static void ggml_cuda_op_mul_mat(
                 GGML_ASSERT(false);
             }
 
-            if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
+            if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
                 quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
@@ -6758,6 +6938,14 @@ static void ggml_cuda_op_mul_mat(
     }
 }
 
+static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
+}
+
+static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
+}
+
 static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }
@@ -6812,13 +7000,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
 
-    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
 
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
@@ -6843,13 +7031,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
 
-    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
 
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     const int64_t row_stride_x = nb01 / sizeof(half);
@@ -6870,11 +7058,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }
 
-    if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
     } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    }else if (src0->type == GGML_TYPE_F32) {
+    } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
@@ -6906,6 +7094,10 @@ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
 }
 
+static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
+}
+
 static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));
@@ -6935,8 +7127,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
-    const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
 
     char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
     char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
@@ -6991,8 +7183,8 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 
     const size_t nb1 = tensor->nb[1];
 
-    ggml_backend backend = tensor->backend;
-    struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
+    ggml_backend_type backend = tensor->backend;
+    ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
 
     for (int64_t id = 0; id < g_device_count; ++id) {
@@ -7046,7 +7238,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
             CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
         }
 
-
         CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
 
         extra->data_device[id] = buf;
@@ -7085,17 +7276,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }
 
-static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
 static size_t g_temp_tensor_extra_index = 0;
 
-static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     if (g_temp_tensor_extras == nullptr) {
         g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
     }
 
     size_t alloc_index = g_temp_tensor_extra_index;
     g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
-    struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+    ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
     memset(extra, 0, sizeof(*extra));
 
     return extra;
@@ -7123,7 +7314,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
         return;
     }
 
-    struct ggml_tensor_extra_gpu * extra;
+    ggml_tensor_extra_gpu * extra;
 
     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
@@ -7132,7 +7323,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
 
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
@@ -7141,7 +7332,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
         extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
-        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
+        ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
         void * src1_ddv = src1_extra->data_device[g_main_device];
         extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src1_ddv;
@@ -7183,13 +7374,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
         CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
     }
 
-    struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+    ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
 
     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW;
 
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t view_offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
@@ -7207,7 +7398,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
     GGML_ASSERT(ggml_is_contiguous(tensor));
 
-    struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
 }
@@ -7264,58 +7455,47 @@ void ggml_cuda_free_scratch() {
     g_scratch_buffer = nullptr;
 }
 
-bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
+bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
 
+    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+        return false;
+    }
+
     switch (tensor->op) {
+        case GGML_OP_REPEAT:
+            func = ggml_cuda_repeat;
+            break;
+        case GGML_OP_GET_ROWS:
+            func = ggml_cuda_get_rows;
+            break;
        case GGML_OP_DUP:
-            if (!any_on_device) {
-                return false;
-            }
            func = ggml_cuda_dup;
            break;
        case GGML_OP_ADD:
-            if (!any_on_device) {
-                return false;
-            }
            func = ggml_cuda_add;
            break;
        case GGML_OP_MUL:
-            if (!any_on_device) {
-                return false;
-            }
            func = ggml_cuda_mul;
            break;
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(tensor)) {
                case GGML_UNARY_OP_GELU:
-                    if (!any_on_device) {
-                        return false;
-                    }
                    func = ggml_cuda_gelu;
                    break;
                case GGML_UNARY_OP_SILU:
-                    if (!any_on_device) {
-                        return false;
-                    }
                    func = ggml_cuda_silu;
                    break;
                default:
                    return false;
            } break;
        case GGML_OP_NORM:
-            if (!any_on_device) {
-                return false;
-            }
            func = ggml_cuda_norm;
            break;
        case GGML_OP_RMS_NORM:
-            if (!any_on_device) {
-                return false;
-            }
            func = ggml_cuda_rms_norm;
            break;
        case GGML_OP_MUL_MAT:
@@ -7325,54 +7505,36 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
            func = ggml_cuda_mul_mat;
            break;
        case GGML_OP_SCALE:
-            if (!any_on_device) {
-                return false;
-            }
            func = ggml_cuda_scale;
            break;
-        case GGML_OP_CPY:
+        case GGML_OP_CLAMP:
            if (!any_on_device) {
                return false;
            }
+            func = ggml_cuda_clamp;
+            break;
+        case GGML_OP_CPY:
            func = ggml_cuda_cpy;
            break;
        case GGML_OP_CONT:
-            if (!any_on_device) {
-                return false;
-            }
            func = ggml_cuda_dup;
            break;
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
-            if (!any_on_device) {
-                return false;
-            }
            func = ggml_cuda_nop;
            break;
        case GGML_OP_DIAG_MASK_INF:
-            if (!any_on_device) {
-                return false;
-            }
            func = ggml_cuda_diag_mask_inf;
            break;
        case GGML_OP_SOFT_MAX:
-            if (!any_on_device) {
-                return false;
-            }
            func = ggml_cuda_soft_max;
            break;
        case GGML_OP_ROPE:
-            if (!any_on_device) {
-                return false;
-            }
            func = ggml_cuda_rope;
            break;
        case GGML_OP_ALIBI:
-            if (!any_on_device) {
-                return false;
-            }
            func = ggml_cuda_alibi;
            break;
        default:
@@ -7400,3 +7562,263 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
     snprintf(description, description_size, "%s", prop.name);
 }
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+#define UNUSED GGML_UNUSED
+
+struct ggml_backend_context_cuda {
+};
+
+static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+    return GGML_CUDA_NAME;
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+    delete cuda_ctx;
+    delete backend;
+}
+
+struct ggml_backend_buffer_context_cuda {
+    void * device;
+
+    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
+    size_t temp_tensor_extra_index = 0;
+
+    ~ggml_backend_buffer_context_cuda() {
+        delete[] temp_tensor_extras;
+    }
+
+    ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+        if (temp_tensor_extras == nullptr) {
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+        }
+
+        size_t alloc_index = temp_tensor_extra_index;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
+        memset(extra, 0, sizeof(*extra));
+
+        return extra;
+    }
+};
+
+static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    CUDA_CHECK(cudaFree(ctx->device));
+    delete ctx;
+}
+
+static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    return ctx->device;
+}
+
+static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    int64_t row_low = 0;
+    int64_t row_high = ggml_nrows(tensor);
+    int64_t nrows_split = row_high - row_low;
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+    int64_t ne0 = tensor->ne[0];
+
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }
+    }
+
+    return size;
+
+    UNUSED(buffer);
+}
+
+static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+        assert(tensor->view_src->buffer->backend == buffer->backend);
+        tensor->backend = tensor->view_src->backend;
+        tensor->extra = tensor->view_src->extra;
+        return;
+    }
+
+    ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
+
+    extra->data_device[g_main_device] = tensor->data;
+
+    tensor->backend = GGML_BACKEND_GPU;
+    tensor->extra = extra;
+
+    if (ggml_is_quantized(tensor->type)) {
+        // initialize padding to 0 to avoid possible NaN values
+        int64_t row_low = 0;
+        int64_t row_high = ggml_nrows(tensor);
+        int64_t nrows_split = row_high - row_low;
+
+        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+        size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
+
+        if (padded_size > original_size && tensor->view_src == nullptr) {
+            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
+        }
+    }
+
+    UNUSED(buffer);
+}
+
+static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
+    /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
+    /* .get_base = */ ggml_backend_cuda_buffer_get_base,
+    /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
+    /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .free_tensor = */ NULL,
+};
+
+static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
+    ggml_cuda_set_device(g_main_device);
+
+    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+    CUDA_CHECK(cudaMalloc(&ctx->device, size));
+    return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
+    return 128;
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    GGML_ASSERT(!"not implemented");
+
+    return nullptr;
+
+    UNUSED(backend);
+    UNUSED(cgraph);
+}
+
+static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_cuda_set_device(g_main_device);
+
+    ggml_compute_params params = {};
+    params.type = GGML_TASK_COMPUTE;
+    params.ith = 0;
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+
+        assert(node->backend == GGML_BACKEND_GPU);
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j] != nullptr) {
+                assert(node->src[j]->backend == GGML_BACKEND_GPU);
+            }
+        }
+
+        bool ok = ggml_cuda_compute_forward(&params, node);
+        if (!ok) {
+            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+        }
+        GGML_ASSERT(ok);
+
+#if 0
+        if (node->type == GGML_TYPE_F32) {
+            cudaDeviceSynchronize();
+            std::vector<float> tmp(ggml_nelements(node), 0.0f);
+            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
+            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
+                ggml_type_name(node->src[0]->type),
+                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
+                node->src[0]->name,
+                node->src[1] ? node->src[1]->name : "none");
+            double sum = 0.0;
+            double sq_sum = 0.0;
+            for (int i = 0; i < ggml_nelements(node); i++) {
+                printf("%f ", tmp[i]);
+                sum += tmp[i];
+                sq_sum += tmp[i]*tmp[i];
+            }
+            printf("\n");
+            printf("sum: %f, ", sum);
+            printf("sq_sum: %f\n", sq_sum);
+        }
+#endif
+    }
+
+    UNUSED(backend);
+}
+
+static ggml_backend_i cuda_backend_i = {
+    /* .get_name = */ ggml_backend_cuda_name,
+    /* .free = */ ggml_backend_cuda_free,
+    /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer,
+    /* .get_alignment = */ ggml_backend_cuda_get_alignment,
+    /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
+    /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
+    /* .synchronize = */ ggml_backend_cuda_synchronize,
+    /* .cpy_tensor_from = */ nullptr,
+    /* .cpy_tensor_to = */ nullptr,
+    /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
+    /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
+    /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
+    /* .graph_compute = */ ggml_backend_cuda_graph_compute,
+    /* .supports_op = */ nullptr,
+};
+
+ggml_backend_t ggml_backend_cuda_init() {
+    ggml_init_cublas(); // TODO: remove from ggml.c
+
+    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
+
+    ggml_backend_t cuda_backend = new ggml_backend {
+        /* .interface = */ cuda_backend_i,
+        /* .context = */ ctx
+    };
+
+    return cuda_backend;
+}
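
Note on the new clamp path added in this diff: clamp_f32 computes one output element per thread, and clamp_f32_cuda rounds the element count up to whole blocks of CUDA_CLAMP_BLOCK_SIZE (256). Below is a minimal standalone sketch of that same launch pattern outside of ggml, using plain CUDA only; the CLAMP_BLOCK_SIZE macro and the main() driver are illustrative and not part of the diff or of llama.cpp.

// clamp_demo.cu -- standalone illustration of the clamp kernel/launcher pattern
#include <cstdio>
#include <cuda_runtime.h>

#define CLAMP_BLOCK_SIZE 256  // mirrors CUDA_CLAMP_BLOCK_SIZE above (assumed name here)

static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return; // threads past the end of the array do nothing
    }
    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
}

int main() {
    const int k = 1000;
    float h[k];
    for (int i = 0; i < k; i++) h[i] = i - 500.0f;

    float *d_x, *d_dst;
    cudaMalloc(&d_x, k*sizeof(float));
    cudaMalloc(&d_dst, k*sizeof(float));
    cudaMemcpy(d_x, h, k*sizeof(float), cudaMemcpyHostToDevice);

    // round up to a whole number of blocks, exactly as clamp_f32_cuda does
    const int num_blocks = (k + CLAMP_BLOCK_SIZE - 1) / CLAMP_BLOCK_SIZE;
    clamp_f32<<<num_blocks, CLAMP_BLOCK_SIZE>>>(d_x, d_dst, -1.0f, 1.0f, k);

    cudaMemcpy(h, d_dst, k*sizeof(float), cudaMemcpyDeviceToHost);
    printf("h[0]=%f h[999]=%f\n", h[0], h[999]); // expect -1.0 and 1.0

    cudaFree(d_x);
    cudaFree(d_dst);
    return 0;
}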