llama_cpp 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -62,6 +62,7 @@
  #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
  #define cudaMemcpyKind hipMemcpyKind
  #define cudaMemset hipMemset
+ #define cudaMemsetAsync hipMemsetAsync
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
  #define cudaSetDevice hipSetDevice
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -414,11 +415,13 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_SILU_BLOCK_SIZE 256
  #define CUDA_CPY_BLOCK_SIZE 32
  #define CUDA_SCALE_BLOCK_SIZE 256
+ #define CUDA_CLAMP_BLOCK_SIZE 256
  #define CUDA_ROPE_BLOCK_SIZE 256
  #define CUDA_ALIBI_BLOCK_SIZE 32
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+ #define CUDA_GET_ROWS_BLOCK_SIZE 256

  // dmmv = dequantize_mul_mat_vec
  #ifndef GGML_CUDA_DMMV_X
@@ -1574,6 +1577,34 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
  reinterpret_cast<half&>(y[ib].ds.y) = sum;
  }

+ template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+ static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
+ const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
+
+ if (col >= ncols) {
+ return;
+ }
+
+ const int r = y[row];
+
+ // copy x[r*ncols + col] to dst[row*ncols + col]
+ const int xi = r*ncols + col;
+ const int di = row*ncols + col;
+
+ const int ib = xi/qk; // block index
+ const int iqs = (xi%qk)/qr; // quant index
+ const int iybs = di - di%qk; // y block start index
+ const int y_offset = qr == 1 ? 1 : qk/2;
+
+ // dequantize
+ dfloat2 v;
+ dequantize_kernel(x, ib, iqs, v);
+
+ dst[iybs + iqs + 0] = v.x;
+ dst[iybs + iqs + y_offset] = v.y;
+ }
+
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
  static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
@@ -4555,6 +4586,24 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
  dst[i] = scale * x[i];
  }

+ static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+
+ dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
+ }
+
+ template<int qk, int qr, dequantize_kernel_t dq>
+ static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
+ const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+ const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+ const dim3 block_nums(block_num_x, nrows, 1);
+ k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
+ }
+
  static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
  const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
  add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -5436,6 +5485,11 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
  }

+ static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
+ clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
+ }
+
  template<typename T>
  static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
  const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
@@ -5703,7 +5757,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
  } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
  GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
  kind = cudaMemcpyDeviceToDevice;
- struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
  int id;
  CUDA_CHECK(cudaGetDevice(&id));
  src_ptr = (char *) extra->data_device[id];
@@ -5739,6 +5793,107 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
  }
  }

+ static void ggml_cuda_op_repeat(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+ // guaranteed to be an integer due to the check in ggml_can_repeat
+ const int64_t ne0 = dst->ne[0];
+ const int64_t ne1 = dst->ne[1];
+ const int64_t ne2 = dst->ne[2];
+ const int64_t ne3 = dst->ne[3];
+
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t ne03 = src0->ne[3];
+
+ const size_t nb0 = dst->nb[0];
+ const size_t nb1 = dst->nb[1];
+ const size_t nb2 = dst->nb[2];
+ const size_t nb3 = dst->nb[3];
+
+ const size_t nb00 = src0->nb[0];
+ const size_t nb01 = src0->nb[1];
+ const size_t nb02 = src0->nb[2];
+ const size_t nb03 = src0->nb[3];
+
+ const int nr0 = (int)(ne0/ne00);
+ const int nr1 = (int)(ne1/ne01);
+ const int nr2 = (int)(ne2/ne02);
+ const int nr3 = (int)(ne3/ne03);
+
+ // TODO: support for transposed / permuted tensors
+ GGML_ASSERT(nb0 == sizeof(float));
+ GGML_ASSERT(nb00 == sizeof(float));
+
+ // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
+ for (int i3 = 0; i3 < nr3; i3++) {
+ for (int k3 = 0; k3 < ne03; k3++) {
+ for (int i2 = 0; i2 < nr2; i2++) {
+ for (int k2 = 0; k2 < ne02; k2++) {
+ for (int i1 = 0; i1 < nr1; i1++) {
+ for (int k1 = 0; k1 < ne01; k1++) {
+ for (int i0 = 0; i0 < nr0; i0++) {
+ CUDA_CHECK(cudaMemcpyAsync(
+ (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
+ (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
+ ne00*nb0, cudaMemcpyDeviceToDevice, stream));
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ (void) src1;
+ (void) src1_d;
+ }
+
+ static void ggml_cuda_op_get_rows(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(ggml_is_contiguous(src0));
+ GGML_ASSERT(ggml_is_contiguous(src1));
+ GGML_ASSERT(ggml_is_contiguous(dst));
+
+ const int ncols = src0->ne[0];
+ const int nrows = ggml_nelements(src1);
+
+ const int32_t * src1_i32 = (const int32_t *) src1_d;
+
+ switch (src0->type) {
+ case GGML_TYPE_F16:
+ get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+ break;
+ case GGML_TYPE_F32:
+ get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+ break;
+ case GGML_TYPE_Q4_0:
+ get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+ break;
+ case GGML_TYPE_Q4_1:
+ get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+ break;
+ case GGML_TYPE_Q5_0:
+ get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+ break;
+ case GGML_TYPE_Q5_1:
+ get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+ break;
+ case GGML_TYPE_Q8_0:
+ get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+ break;
+ default:
+ // TODO: k-quants
+ GGML_ASSERT(false);
+ break;
+ }
+ }
+
  inline void ggml_cuda_op_add(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6279,12 +6434,12 @@ inline void ggml_cuda_op_alibi(
  const int64_t ne02 = src0->ne[2];
  const int64_t nrows = ggml_nrows(src0);

- const int n_past = ((int32_t *) dst->op_params)[0];
+ //const int n_past = ((int32_t *) dst->op_params)[0];
  const int n_head = ((int32_t *) dst->op_params)[1];
  float max_bias;
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

- GGML_ASSERT(ne01 + n_past == ne00);
+ //GGML_ASSERT(ne01 + n_past == ne00);
  GGML_ASSERT(n_head == ne02);

  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -6343,7 +6498,14 @@ inline void ggml_cuda_op_scale(
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

- const float scale = ((float *) src1->data)[0];
+ float scale;
+ // HACK: support for ggml backend interface
+ if (src1->backend == GGML_BACKEND_CPU) {
+ scale = ((float *) src1->data)[0];
+ } else {
+ // TODO: pass pointer to kernel instead of copying to host
+ CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
+ }

  scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
  CUDA_CHECK(cudaGetLastError());
@@ -6353,6 +6515,24 @@ inline void ggml_cuda_op_scale(
  (void) src1_dd;
  }

+ inline void ggml_cuda_op_clamp(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+ const float min = ((float *) dst->op_params)[0];
+ const float max = ((float *) dst->op_params)[1];
+
+ clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
+ CUDA_CHECK(cudaGetLastError());
+
+ (void) src1;
+ (void) dst;
+ (void) src1_dd;
+ }
+
  static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
  const int64_t nrows0 = ggml_nrows(src0);

@@ -6362,9 +6542,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
  GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);

- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
- struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+ ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
  const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
@@ -6505,9 +6685,9 @@ static void ggml_cuda_op_mul_mat(
  const size_t q8_1_ts = sizeof(block_q8_1);
  const size_t q8_1_bs = QK8_1;

- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
@@ -6585,7 +6765,7 @@ static void ggml_cuda_op_mul_mat(
  if (convert_src1_to_q8_1) {
  src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);

- if (split && src1_on_device && src1_is_contiguous) {
+ if (src1_on_device && src1_is_contiguous) {
  quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
  CUDA_CHECK(cudaGetLastError());
  }
@@ -6667,7 +6847,7 @@ static void ggml_cuda_op_mul_mat(
  GGML_ASSERT(false);
  }

- if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
+ if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
  quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
  CUDA_CHECK(cudaGetLastError());
  }
@@ -6758,6 +6938,14 @@ static void ggml_cuda_op_mul_mat(
  }
  }

+ static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
+ }
+
+ static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
+ }
+
  static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
  }
@@ -6812,13 +7000,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
  void * src0_ddq = src0_extra->data_device[g_main_device];

- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
@@ -6843,13 +7031,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
  void * src0_ddq = src0_extra->data_device[g_main_device];

- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

  const int64_t row_stride_x = nb01 / sizeof(half);
@@ -6870,11 +7058,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
  }
  }

- if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+ if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
  } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
- }else if (src0->type == GGML_TYPE_F32) {
+ } else if (src0->type == GGML_TYPE_F32) {
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
@@ -6906,6 +7094,10 @@ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1,
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
  }

+ static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
+ }
+
  static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  const int64_t ne = ggml_nelements(src0);
  GGML_ASSERT(ne == ggml_nelements(src1));
@@ -6935,8 +7127,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

- const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
- const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+ const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+ const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;

  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
  char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
@@ -6991,8 +7183,8 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {

  const size_t nb1 = tensor->nb[1];

- ggml_backend backend = tensor->backend;
- struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
+ ggml_backend_type backend = tensor->backend;
+ ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
  memset(extra, 0, sizeof(*extra));

  for (int64_t id = 0; id < g_device_count; ++id) {
@@ -7046,7 +7238,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
  }

-
  CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));

  extra->data_device[id] = buf;
@@ -7085,17 +7276,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
  delete extra;
  }

- static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+ static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
  static size_t g_temp_tensor_extra_index = 0;

- static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+ static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
  if (g_temp_tensor_extras == nullptr) {
  g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
  }

  size_t alloc_index = g_temp_tensor_extra_index;
  g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
- struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+ ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
  memset(extra, 0, sizeof(*extra));

  return extra;
@@ -7123,7 +7314,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
  return;
  }

- struct ggml_tensor_extra_gpu * extra;
+ ggml_tensor_extra_gpu * extra;

  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
  tensor->op == GGML_OP_VIEW ||
@@ -7132,7 +7323,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra

  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
  size_t offset = 0;
  if (tensor->op == GGML_OP_VIEW) {
@@ -7141,7 +7332,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
  extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = src0_ddc + offset;
  } else if (tensor->op == GGML_OP_CPY) {
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
  void * src1_ddv = src1_extra->data_device[g_main_device];
  extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = src1_ddv;
@@ -7183,13 +7374,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
  CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
  }

- struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+ ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();

  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
  tensor->op == GGML_OP_VIEW;

  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
  size_t view_offset = 0;
  if (tensor->op == GGML_OP_VIEW) {
@@ -7207,7 +7398,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
  GGML_ASSERT(ggml_is_contiguous(tensor));

- struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
  }
@@ -7264,58 +7455,47 @@ void ggml_cuda_free_scratch() {
  g_scratch_buffer = nullptr;
  }

- bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
+ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
  ggml_cuda_func_t func;
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
  || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
  || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

+ if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+ return false;
+ }
+
  switch (tensor->op) {
+ case GGML_OP_REPEAT:
+ func = ggml_cuda_repeat;
+ break;
+ case GGML_OP_GET_ROWS:
+ func = ggml_cuda_get_rows;
+ break;
  case GGML_OP_DUP:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_dup;
  break;
  case GGML_OP_ADD:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_add;
  break;
  case GGML_OP_MUL:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_mul;
  break;
  case GGML_OP_UNARY:
  switch (ggml_get_unary_op(tensor)) {
  case GGML_UNARY_OP_GELU:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_gelu;
  break;
  case GGML_UNARY_OP_SILU:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_silu;
  break;
  default:
  return false;
  } break;
  case GGML_OP_NORM:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_norm;
  break;
  case GGML_OP_RMS_NORM:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_rms_norm;
  break;
  case GGML_OP_MUL_MAT:
@@ -7325,54 +7505,36 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  func = ggml_cuda_mul_mat;
  break;
  case GGML_OP_SCALE:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_scale;
  break;
- case GGML_OP_CPY:
+ case GGML_OP_CLAMP:
  if (!any_on_device) {
  return false;
  }
+ func = ggml_cuda_clamp;
+ break;
+ case GGML_OP_CPY:
  func = ggml_cuda_cpy;
  break;
  case GGML_OP_CONT:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_dup;
  break;
  case GGML_OP_RESHAPE:
  case GGML_OP_VIEW:
  case GGML_OP_PERMUTE:
  case GGML_OP_TRANSPOSE:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_nop;
  break;
  case GGML_OP_DIAG_MASK_INF:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_diag_mask_inf;
  break;
  case GGML_OP_SOFT_MAX:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_soft_max;
  break;
  case GGML_OP_ROPE:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_rope;
  break;
  case GGML_OP_ALIBI:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_alibi;
  break;
  default:
@@ -7400,3 +7562,263 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
  snprintf(description, description_size, "%s", prop.name);
  }
+
+ ////////////////////////////////////////////////////////////////////////////////
+
+ // backend interface
+
+ #define UNUSED GGML_UNUSED
+
+ struct ggml_backend_context_cuda {
+ };
+
+ static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+ return GGML_CUDA_NAME;
+
+ UNUSED(backend);
+ }
+
+ static void ggml_backend_cuda_free(ggml_backend_t backend) {
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+ delete cuda_ctx;
+ delete backend;
+ }
+
+ struct ggml_backend_buffer_context_cuda {
+ void * device;
+
+ ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
+ size_t temp_tensor_extra_index = 0;
+
+ ~ggml_backend_buffer_context_cuda() {
+ delete[] temp_tensor_extras;
+ }
+
+ ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+ if (temp_tensor_extras == nullptr) {
+ temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+ }
+
+ size_t alloc_index = temp_tensor_extra_index;
+ temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+ ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
+ memset(extra, 0, sizeof(*extra));
+
+ return extra;
+ }
+ };
+
+ static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ CUDA_CHECK(cudaFree(ctx->device));
+ delete ctx;
+ }
+
+ static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ return ctx->device;
+ }
+
+ static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+ int64_t row_low = 0;
+ int64_t row_high = ggml_nrows(tensor);
+ int64_t nrows_split = row_high - row_low;
+
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+ int64_t ne0 = tensor->ne[0];
+
+ if (ggml_is_quantized(tensor->type)) {
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
+ size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+ * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+ }
+ }
+
+ return size;
+
+ UNUSED(buffer);
+ }
+
+ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+ if (tensor->view_src != NULL && tensor->view_offs == 0) {
+ assert(tensor->view_src->buffer->backend == buffer->backend);
+ tensor->backend = tensor->view_src->backend;
+ tensor->extra = tensor->view_src->extra;
+ return;
+ }
+
+ ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
+
+ extra->data_device[g_main_device] = tensor->data;
+
+ tensor->backend = GGML_BACKEND_GPU;
+ tensor->extra = extra;
+
+ if (ggml_is_quantized(tensor->type)) {
+ // initialize padding to 0 to avoid possible NaN values
+ int64_t row_low = 0;
+ int64_t row_high = ggml_nrows(tensor);
+ int64_t nrows_split = row_high - row_low;
+
+ size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+ size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
+
+ if (padded_size > original_size && tensor->view_src == nullptr) {
+ CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
+ }
+ }
+
+ UNUSED(buffer);
+ }
+
+ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
+ /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
+ /* .get_base = */ ggml_backend_cuda_buffer_get_base,
+ /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
+ /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
+ /* .free_tensor = */ NULL,
+ };
+
+ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
+ ggml_cuda_set_device(g_main_device);
+
+ ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+ CUDA_CHECK(cudaMalloc(&ctx->device, size));
+ return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
+ }
+
+ static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
+ return 128;
+ UNUSED(backend);
+ }
+
+ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+ CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
+
+ UNUSED(backend);
+ }
+
+ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+ CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+
+ UNUSED(backend);
+ }
+
+ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+ CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+ UNUSED(backend);
+ }
+
+ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
+ GGML_ASSERT(!"not implemented");
+
+ return nullptr;
+
+ UNUSED(backend);
+ UNUSED(cgraph);
+ }
+
+ static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ GGML_ASSERT(!"not implemented");
+
+ UNUSED(backend);
+ UNUSED(plan);
+ }
+
+ static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ GGML_ASSERT(!"not implemented");
+
+ UNUSED(backend);
+ UNUSED(plan);
+ }
+
+ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+ ggml_cuda_set_device(g_main_device);
+
+ ggml_compute_params params = {};
+ params.type = GGML_TASK_COMPUTE;
+ params.ith = 0;
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ ggml_tensor * node = cgraph->nodes[i];
+
+ assert(node->backend == GGML_BACKEND_GPU);
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ if (node->src[j] != nullptr) {
+ assert(node->src[j]->backend == GGML_BACKEND_GPU);
+ }
+ }
+
+ bool ok = ggml_cuda_compute_forward(&params, node);
+ if (!ok) {
+ fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ }
+ GGML_ASSERT(ok);
+
+ #if 0
+ if (node->type == GGML_TYPE_F32) {
+ cudaDeviceSynchronize();
+ std::vector<float> tmp(ggml_nelements(node), 0.0f);
+ cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
+ printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
+ ggml_type_name(node->src[0]->type),
+ node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
+ node->src[0]->name,
+ node->src[1] ? node->src[1]->name : "none");
+ double sum = 0.0;
+ double sq_sum = 0.0;
+ for (int i = 0; i < ggml_nelements(node); i++) {
+ printf("%f ", tmp[i]);
+ sum += tmp[i];
+ sq_sum += tmp[i]*tmp[i];
+ }
+ printf("\n");
+ printf("sum: %f, ", sum);
+ printf("sq_sum: %f\n", sq_sum);
+ }
+ #endif
+ }
+
+ UNUSED(backend);
+ }
+
+ static ggml_backend_i cuda_backend_i = {
+ /* .get_name = */ ggml_backend_cuda_name,
+ /* .free = */ ggml_backend_cuda_free,
+ /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cuda_get_alignment,
+ /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
+ /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
+ /* .synchronize = */ ggml_backend_cuda_synchronize,
+ /* .cpy_tensor_from = */ nullptr,
+ /* .cpy_tensor_to = */ nullptr,
+ /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
+ /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
+ /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
+ /* .graph_compute = */ ggml_backend_cuda_graph_compute,
+ /* .supports_op = */ nullptr,
+ };
+
+ ggml_backend_t ggml_backend_cuda_init() {
+ ggml_init_cublas(); // TODO: remove from ggml.c
+
+ ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
+
+ ggml_backend_t cuda_backend = new ggml_backend {
+ /* .interface = */ cuda_backend_i,
+ /* .context = */ ctx
+ };
+
+ return cuda_backend;
+ }