llama_cpp 0.12.4 → 0.12.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7693,6 +7693,13 @@ static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
7693
7693
  *dsti = *xi;
7694
7694
  }
7695
7695
 
7696
+ static void cpy_1_f16_f32(const char * cxi, char * cdsti) {
7697
+ const sycl::half *xi = (const sycl::half *)cxi;
7698
+ float *dsti = (float *)cdsti;
7699
+
7700
+ *dsti = *xi;
7701
+ }
7702
+
7696
7703
  static void cpy_1_i16_i16(const char * cxi, char * cdsti) {
7697
7704
  const int16_t *xi = (const int16_t *)cxi;
7698
7705
  int16_t *dsti = (int16_t *)cdsti;
@@ -7709,9 +7716,9 @@ static void cpy_1_i32_i32(const char * cxi, char * cdsti) {
7709
7716
 
7710
7717
  template <cpy_kernel_t cpy_1>
7711
7718
  static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
7712
- const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
7713
- const int ne10, const int ne11, const int nb10, const int nb11, const int nb12,
7714
- const sycl::nd_item<3> &item_ct1) {
7719
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
7720
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
7721
+ const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) {
7715
7722
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
7716
7723
  item_ct1.get_local_id(2);
7717
7724
 
@@ -7721,15 +7728,17 @@ static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
7721
7728
 
7722
7729
  // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
7723
7730
  // then combine those indices with the corresponding byte offsets to get the total offsets
7724
- const int i02 = i / (ne00*ne01);
7725
- const int i01 = (i - i02*ne01*ne00) / ne00;
7726
- const int i00 = i - i02*ne01*ne00 - i01*ne00;
7727
- const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
7728
-
7729
- const int i12 = i / (ne10*ne11);
7730
- const int i11 = (i - i12*ne10*ne11) / ne10;
7731
- const int i10 = i - i12*ne10*ne11 - i11*ne10;
7732
- const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
7731
+ const int i03 = i/(ne00 * ne01 * ne02);
7732
+ const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
7733
+ const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
7734
+ const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
7735
+ const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
7736
+
7737
+ const int i13 = i/(ne10 * ne11 * ne12);
7738
+ const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
7739
+ const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
7740
+ const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
7741
+ const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
7733
7742
 
7734
7743
  cpy_1(cx + x_offset, cdst + dst_offset);
7735
7744
  }
@@ -7823,9 +7832,9 @@ static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
7823
7832
 
7824
7833
  template <cpy_kernel_t cpy_blck, int qk>
7825
7834
  static void cpy_f32_q(const char * cx, char * cdst, const int ne,
7826
- const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
7827
- const int ne10, const int ne11, const int nb10, const int nb11, const int nb12,
7828
- const sycl::nd_item<3> &item_ct1) {
7835
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
7836
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
7837
+ const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) {
7829
7838
  const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
7830
7839
  item_ct1.get_local_id(2)) *
7831
7840
  qk;
@@ -7834,15 +7843,17 @@ static void cpy_f32_q(const char * cx, char * cdst, const int ne,
7834
7843
  return;
7835
7844
  }
7836
7845
 
7837
- const int i02 = i / (ne00*ne01);
7838
- const int i01 = (i - i02*ne01*ne00) / ne00;
7839
- const int i00 = (i - i02*ne01*ne00 - i01*ne00);
7840
- const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
7846
+ const int i03 = i/(ne00 * ne01 * ne02);
7847
+ const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
7848
+ const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
7849
+ const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
7850
+ const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
7841
7851
 
7842
- const int i12 = i / (ne10*ne11);
7843
- const int i11 = (i - i12*ne10*ne11) / ne10;
7844
- const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
7845
- const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
7852
+ const int i13 = i/(ne10 * ne11 * ne12);
7853
+ const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
7854
+ const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
7855
+ const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
7856
+ const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
7846
7857
 
7847
7858
  cpy_blck(cx + x_offset, cdst + dst_offset);
7848
7859
  }
@@ -8247,7 +8258,8 @@ static void clamp_f32(const float * x, float * dst, const float min, const float
8247
8258
  dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
8248
8259
  }
8249
8260
 
8250
- static void im2col_f32_f16(const float *x, sycl::half *dst, int offset_delta,
8261
+ template <typename T>
8262
+ static void im2col_kernel(const float *x, T *dst, int offset_delta,
8251
8263
  int IW, int IH, int OW, int KW, int KH,
8252
8264
  int pelements, int CHW, int s0, int s1, int p0,
8253
8265
  int p1, int d0, int d1,
@@ -10598,10 +10610,12 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
10598
10610
 
10599
10611
  static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
10600
10612
  const int ne00, const int ne01,
10601
- const int nb00, const int nb01,
10602
- const int nb02, const int ne10,
10603
- const int ne11, const int nb10,
10604
- const int nb11, const int nb12,
10613
+ const int ne02, const int nb00,
10614
+ const int nb01, const int nb02,
10615
+ const int nb03, const int ne10,
10616
+ const int ne11, const int ne12,
10617
+ const int nb10, const int nb11,
10618
+ const int nb12, const int nb13,
10605
10619
  dpct::queue_ptr stream) {
10606
10620
 
10607
10621
  const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
@@ -10614,8 +10628,8 @@ static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
10614
10628
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
10615
10629
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
10616
10630
  [=](sycl::nd_item<3> item_ct1) {
10617
- cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, nb00, nb01,
10618
- nb02, ne10, ne11, nb10, nb11, nb12,
10631
+ cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
10632
+ nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
10619
10633
  item_ct1);
10620
10634
  });
10621
10635
  }
@@ -10623,10 +10637,12 @@ static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
10623
10637
 
10624
10638
  static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
10625
10639
  const int ne00, const int ne01,
10626
- const int nb00, const int nb01,
10627
- const int nb02, const int ne10,
10628
- const int ne11, const int nb10,
10629
- const int nb11, const int nb12,
10640
+ const int ne02, const int nb00,
10641
+ const int nb01, const int nb02,
10642
+ const int nb03, const int ne10,
10643
+ const int ne11, const int ne12,
10644
+ const int nb10, const int nb11,
10645
+ const int nb12, const int nb13,
10630
10646
  dpct::queue_ptr stream) {
10631
10647
 
10632
10648
  const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
@@ -10639,8 +10655,8 @@ static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
10639
10655
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
10640
10656
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
10641
10657
  [=](sycl::nd_item<3> item_ct1) {
10642
- cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
10643
- nb02, ne10, ne11, nb10, nb11, nb12,
10658
+ cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
10659
+ nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
10644
10660
  item_ct1);
10645
10661
  });
10646
10662
  }
@@ -10648,10 +10664,12 @@ static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
10648
10664
 
10649
10665
  static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne,
10650
10666
  const int ne00, const int ne01,
10651
- const int nb00, const int nb01,
10652
- const int nb02, const int ne10,
10653
- const int ne11, const int nb10,
10654
- const int nb11, const int nb12,
10667
+ const int ne02, const int nb00,
10668
+ const int nb01, const int nb02,
10669
+ const int nb03, const int ne10,
10670
+ const int ne11, const int ne12,
10671
+ const int nb10, const int nb11,
10672
+ const int nb12, const int nb13,
10655
10673
  dpct::queue_ptr stream) {
10656
10674
 
10657
10675
  GGML_ASSERT(ne % QK8_0 == 0);
@@ -10660,17 +10678,20 @@ static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne,
10660
10678
  sycl::range<3>(1, 1, 1)),
10661
10679
  [=](sycl::nd_item<3> item_ct1) {
10662
10680
  cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(
10663
- cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
10664
- ne10, ne11, nb10, nb11, nb12, item_ct1);
10681
+ cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
10682
+ nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
10683
+ item_ct1);
10665
10684
  });
10666
10685
  }
10667
10686
 
10668
10687
  static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne,
10669
10688
  const int ne00, const int ne01,
10670
- const int nb00, const int nb01,
10671
- const int nb02, const int ne10,
10672
- const int ne11, const int nb10,
10673
- const int nb11, const int nb12,
10689
+ const int ne02, const int nb00,
10690
+ const int nb01, const int nb02,
10691
+ const int nb03, const int ne10,
10692
+ const int ne11, const int ne12,
10693
+ const int nb10, const int nb11,
10694
+ const int nb12, const int nb13,
10674
10695
  dpct::queue_ptr stream) {
10675
10696
 
10676
10697
  GGML_ASSERT(ne % QK4_0 == 0);
@@ -10679,17 +10700,20 @@ static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne,
10679
10700
  sycl::range<3>(1, 1, 1)),
10680
10701
  [=](sycl::nd_item<3> item_ct1) {
10681
10702
  cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(
10682
- cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
10683
- ne10, ne11, nb10, nb11, nb12, item_ct1);
10703
+ cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
10704
+ nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
10705
+ item_ct1);
10684
10706
  });
10685
10707
  }
10686
10708
 
10687
10709
  static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne,
10688
10710
  const int ne00, const int ne01,
10689
- const int nb00, const int nb01,
10690
- const int nb02, const int ne10,
10691
- const int ne11, const int nb10,
10692
- const int nb11, const int nb12,
10711
+ const int ne02, const int nb00,
10712
+ const int nb01, const int nb02,
10713
+ const int nb03, const int ne10,
10714
+ const int ne11, const int ne12,
10715
+ const int nb10, const int nb11,
10716
+ const int nb12, const int nb13,
10693
10717
  dpct::queue_ptr stream) {
10694
10718
 
10695
10719
  GGML_ASSERT(ne % QK4_1 == 0);
@@ -10698,17 +10722,20 @@ static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne,
10698
10722
  sycl::range<3>(1, 1, 1)),
10699
10723
  [=](sycl::nd_item<3> item_ct1) {
10700
10724
  cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(
10701
- cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
10702
- ne10, ne11, nb10, nb11, nb12, item_ct1);
10725
+ cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
10726
+ nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
10727
+ item_ct1);
10703
10728
  });
10704
10729
  }
10705
10730
 
10706
10731
  static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
10707
10732
  const int ne00, const int ne01,
10708
- const int nb00, const int nb01,
10709
- const int nb02, const int ne10,
10710
- const int ne11, const int nb10,
10711
- const int nb11, const int nb12,
10733
+ const int ne02, const int nb00,
10734
+ const int nb01, const int nb02,
10735
+ const int nb03, const int ne10,
10736
+ const int ne11, const int ne12,
10737
+ const int nb10, const int nb11,
10738
+ const int nb12, const int nb13,
10712
10739
  dpct::queue_ptr stream) {
10713
10740
 
10714
10741
  const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
@@ -10721,8 +10748,8 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
10721
10748
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
10722
10749
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
10723
10750
  [=](sycl::nd_item<3> item_ct1) {
10724
- cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
10725
- nb02, ne10, ne11, nb10, nb11, nb12,
10751
+ cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
10752
+ nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
10726
10753
  item_ct1);
10727
10754
  });
10728
10755
  }
@@ -10730,10 +10757,12 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
10730
10757
 
10731
10758
  static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
10732
10759
  const int ne00, const int ne01,
10733
- const int nb00, const int nb01,
10734
- const int nb02, const int ne10,
10735
- const int ne11, const int nb10,
10736
- const int nb11, const int nb12,
10760
+ const int ne02, const int nb00,
10761
+ const int nb01, const int nb02,
10762
+ const int nb03, const int ne10,
10763
+ const int ne11, const int ne12,
10764
+ const int nb10, const int nb11,
10765
+ const int nb12, const int nb13,
10737
10766
  dpct::queue_ptr stream) {
10738
10767
 
10739
10768
  const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
@@ -10746,8 +10775,8 @@ static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
10746
10775
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
10747
10776
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
10748
10777
  [=](sycl::nd_item<3> item_ct1) {
10749
- cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
10750
- nb02, ne10, ne11, nb10, nb11, nb12,
10778
+ cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
10779
+ nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
10751
10780
  item_ct1);
10752
10781
  });
10753
10782
  }
@@ -10755,10 +10784,12 @@ static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
10755
10784
 
10756
10785
  static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne,
10757
10786
  const int ne00, const int ne01,
10758
- const int nb00, const int nb01,
10759
- const int nb02, const int ne10,
10760
- const int ne11, const int nb10,
10761
- const int nb11, const int nb12,
10787
+ const int ne02, const int nb00,
10788
+ const int nb01, const int nb02,
10789
+ const int nb03, const int ne10,
10790
+ const int ne11, const int ne12,
10791
+ const int nb10, const int nb11,
10792
+ const int nb12, const int nb13,
10762
10793
  dpct::queue_ptr stream) {
10763
10794
 
10764
10795
  const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
@@ -10771,8 +10802,8 @@ static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne,
10771
10802
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
10772
10803
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
10773
10804
  [=](sycl::nd_item<3> item_ct1) {
10774
- cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, nb00, nb01,
10775
- nb02, ne10, ne11, nb10, nb11, nb12,
10805
+ cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
10806
+ nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
10776
10807
  item_ct1);
10777
10808
  });
10778
10809
  }
@@ -11019,7 +11050,8 @@ static void soft_max_f32_sycl(const float *x, const float *y, float *dst,
11019
11050
  });
11020
11051
  }
11021
11052
 
11022
- static void im2col_f32_f16_sycl(const float *x, sycl::half *dst, int IW, int IH,
11053
+ template <typename T>
11054
+ static void im2col_sycl(const float *x, T *dst, int IW, int IH,
11023
11055
  int OW, int OH, int KW, int KH, int IC,
11024
11056
  int offset_delta, int s0, int s1, int p0,
11025
11057
  int p1, int d0, int d1,
@@ -11036,7 +11068,7 @@ static void im2col_f32_f16_sycl(const float *x, sycl::half *dst, int IW, int IH,
11036
11068
  sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE),
11037
11069
  sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)),
11038
11070
  [=](sycl::nd_item<3> item_ct1) {
11039
- im2col_f32_f16(x, dst, offset_delta, IW, IH, OW, KW, KH,
11071
+ im2col_kernel(x, dst, offset_delta, IW, IH, OW, KW, KH,
11040
11072
  parallel_elements, (IC * KH * KW), s0, s1, p0,
11041
11073
  p1, d0, d1, item_ct1);
11042
11074
  });
@@ -12116,7 +12148,8 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
12116
12148
  const int64_t src1_ncols, const int64_t src1_padded_row_size,
12117
12149
  const dpct::queue_ptr &stream) {
12118
12150
 
12119
- const int64_t ne00 = src0->ne[0];
12151
+ GGML_TENSOR_BINARY_OP_LOCALS
12152
+
12120
12153
  const int64_t row_diff = row_high - row_low;
12121
12154
 
12122
12155
  // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
@@ -12135,8 +12168,9 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
12135
12168
  } else {
12136
12169
  src1_dfloat = src1_dfloat_a.alloc(ne00);
12137
12170
  ggml_cpy_f32_f16_sycl((const char *)src1_ddf_i, (char *)src1_dfloat,
12138
- ne00, ne00, 1, sizeof(float), 0, 0, ne00, 1,
12139
- sizeof(sycl::half), 0, 0, stream);
12171
+ ne00, ne00, ne01, ne02, nb00, nb01, nb02,
12172
+ nb03, ne10, ne11, ne12, nb10, nb11, nb12,
12173
+ nb13, stream);
12140
12174
  }
12141
12175
  }
12142
12176
  #else
@@ -12424,7 +12458,7 @@ inline void ggml_sycl_op_im2col(const ggml_tensor *src0,
12424
12458
 
12425
12459
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
12426
12460
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
12427
- GGML_ASSERT( dst->type == GGML_TYPE_F16);
12461
+ GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
12428
12462
 
12429
12463
  const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12430
12464
  const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
@@ -12447,8 +12481,11 @@ inline void ggml_sycl_op_im2col(const ggml_tensor *src0,
12447
12481
 
12448
12482
  const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
12449
12483
 
12450
- im2col_f32_f16_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH,
12451
- IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
12484
+ if (dst->type == GGML_TYPE_F16) {
12485
+ im2col_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
12486
+ } else {
12487
+ im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
12488
+ }
12452
12489
 
12453
12490
  (void) src0;
12454
12491
  (void) src0_dd;
@@ -13905,19 +13942,23 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
13905
13942
 
13906
13943
  const int64_t ne00 = src0->ne[0];
13907
13944
  const int64_t ne01 = src0->ne[1];
13908
- GGML_ASSERT(src0->ne[3] == 1);
13945
+ const int64_t ne02 = src0->ne[2];
13946
+
13909
13947
 
13910
13948
  const int64_t nb00 = src0->nb[0];
13911
13949
  const int64_t nb01 = src0->nb[1];
13912
13950
  const int64_t nb02 = src0->nb[2];
13951
+ const int64_t nb03 = src0->nb[3];
13913
13952
 
13914
13953
  const int64_t ne10 = src1->ne[0];
13915
13954
  const int64_t ne11 = src1->ne[1];
13916
- GGML_ASSERT(src1->ne[3] == 1);
13955
+ const int64_t ne12 = src1->ne[2];
13956
+
13917
13957
 
13918
13958
  const int64_t nb10 = src1->nb[0];
13919
13959
  const int64_t nb11 = src1->nb[1];
13920
13960
  const int64_t nb12 = src1->nb[2];
13961
+ const int64_t nb13 = src1->nb[3];
13921
13962
 
13922
13963
  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
13923
13964
  dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
@@ -13929,21 +13970,21 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
13929
13970
  char * src1_ddc = (char *) src1_extra->data_device[g_main_device_index];
13930
13971
 
13931
13972
  if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
13932
- ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
13973
+ ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
13933
13974
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
13934
- ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
13975
+ ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
13935
13976
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
13936
- ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
13977
+ ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
13937
13978
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
13938
- ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
13979
+ ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
13939
13980
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
13940
- ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
13981
+ ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
13941
13982
  } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
13942
- ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
13983
+ ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
13943
13984
  } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) {
13944
- ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
13985
+ ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
13945
13986
  } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
13946
- ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
13987
+ ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
13947
13988
  } else {
13948
13989
  fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
13949
13990
  ggml_type_name(src0->type), ggml_type_name(src1->type));