llama_cpp 0.12.4 → 0.12.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/vendor/tmp/llama.cpp/Makefile +138 -53
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +688 -270
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +39 -94
- data/vendor/tmp/llama.cpp/ggml-quants.h +68 -59
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +131 -90
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1516 -10656
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1697 -1241
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +14 -9
- data/vendor/tmp/llama.cpp/ggml.c +13 -10
- data/vendor/tmp/llama.cpp/llama.cpp +266 -43
- data/vendor/tmp/llama.cpp/llama.h +1 -1
- metadata +2 -2
@@ -7693,6 +7693,13 @@ static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
|
|
7693
7693
|
*dsti = *xi;
|
7694
7694
|
}
|
7695
7695
|
|
7696
|
+
static void cpy_1_f16_f32(const char * cxi, char * cdsti) {
|
7697
|
+
const sycl::half *xi = (const sycl::half *)cxi;
|
7698
|
+
float *dsti = (float *)cdsti;
|
7699
|
+
|
7700
|
+
*dsti = *xi;
|
7701
|
+
}
|
7702
|
+
|
7696
7703
|
static void cpy_1_i16_i16(const char * cxi, char * cdsti) {
|
7697
7704
|
const int16_t *xi = (const int16_t *)cxi;
|
7698
7705
|
int16_t *dsti = (int16_t *)cdsti;
|
@@ -7709,9 +7716,9 @@ static void cpy_1_i32_i32(const char * cxi, char * cdsti) {
|
|
7709
7716
|
|
7710
7717
|
template <cpy_kernel_t cpy_1>
|
7711
7718
|
static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
7712
|
-
|
7713
|
-
|
7714
|
-
|
7719
|
+
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
7720
|
+
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
7721
|
+
const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) {
|
7715
7722
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
7716
7723
|
item_ct1.get_local_id(2);
|
7717
7724
|
|
@@ -7721,15 +7728,17 @@ static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
|
7721
7728
|
|
7722
7729
|
// determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
|
7723
7730
|
// then combine those indices with the corresponding byte offsets to get the total offsets
|
7724
|
-
const int
|
7725
|
-
const int
|
7726
|
-
const int
|
7727
|
-
const int
|
7728
|
-
|
7729
|
-
|
7730
|
-
const int
|
7731
|
-
const int
|
7732
|
-
const int
|
7731
|
+
const int i03 = i/(ne00 * ne01 * ne02);
|
7732
|
+
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
7733
|
+
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
7734
|
+
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
7735
|
+
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
7736
|
+
|
7737
|
+
const int i13 = i/(ne10 * ne11 * ne12);
|
7738
|
+
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
7739
|
+
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
7740
|
+
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
7741
|
+
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
|
7733
7742
|
|
7734
7743
|
cpy_1(cx + x_offset, cdst + dst_offset);
|
7735
7744
|
}
|
@@ -7823,9 +7832,9 @@ static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
|
|
7823
7832
|
|
7824
7833
|
template <cpy_kernel_t cpy_blck, int qk>
|
7825
7834
|
static void cpy_f32_q(const char * cx, char * cdst, const int ne,
|
7826
|
-
|
7827
|
-
|
7828
|
-
|
7835
|
+
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
7836
|
+
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
7837
|
+
const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) {
|
7829
7838
|
const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
7830
7839
|
item_ct1.get_local_id(2)) *
|
7831
7840
|
qk;
|
@@ -7834,15 +7843,17 @@ static void cpy_f32_q(const char * cx, char * cdst, const int ne,
|
|
7834
7843
|
return;
|
7835
7844
|
}
|
7836
7845
|
|
7837
|
-
const int
|
7838
|
-
const int
|
7839
|
-
const int
|
7840
|
-
const int
|
7846
|
+
const int i03 = i/(ne00 * ne01 * ne02);
|
7847
|
+
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
7848
|
+
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
7849
|
+
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
7850
|
+
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
7841
7851
|
|
7842
|
-
const int
|
7843
|
-
const int
|
7844
|
-
const int
|
7845
|
-
const int
|
7852
|
+
const int i13 = i/(ne10 * ne11 * ne12);
|
7853
|
+
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
7854
|
+
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
7855
|
+
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
7856
|
+
const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
|
7846
7857
|
|
7847
7858
|
cpy_blck(cx + x_offset, cdst + dst_offset);
|
7848
7859
|
}
|
@@ -8247,7 +8258,8 @@ static void clamp_f32(const float * x, float * dst, const float min, const float
|
|
8247
8258
|
dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
|
8248
8259
|
}
|
8249
8260
|
|
8250
|
-
|
8261
|
+
template <typename T>
|
8262
|
+
static void im2col_kernel(const float *x, T *dst, int offset_delta,
|
8251
8263
|
int IW, int IH, int OW, int KW, int KH,
|
8252
8264
|
int pelements, int CHW, int s0, int s1, int p0,
|
8253
8265
|
int p1, int d0, int d1,
|
@@ -10598,10 +10610,12 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
|
|
10598
10610
|
|
10599
10611
|
static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
|
10600
10612
|
const int ne00, const int ne01,
|
10601
|
-
const int
|
10602
|
-
const int
|
10603
|
-
const int
|
10604
|
-
const int
|
10613
|
+
const int ne02, const int nb00,
|
10614
|
+
const int nb01, const int nb02,
|
10615
|
+
const int nb03, const int ne10,
|
10616
|
+
const int ne11, const int ne12,
|
10617
|
+
const int nb10, const int nb11,
|
10618
|
+
const int nb12, const int nb13,
|
10605
10619
|
dpct::queue_ptr stream) {
|
10606
10620
|
|
10607
10621
|
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
@@ -10614,8 +10628,8 @@ static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
|
|
10614
10628
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
10615
10629
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
10616
10630
|
[=](sycl::nd_item<3> item_ct1) {
|
10617
|
-
cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, nb00, nb01,
|
10618
|
-
|
10631
|
+
cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
10632
|
+
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
10619
10633
|
item_ct1);
|
10620
10634
|
});
|
10621
10635
|
}
|
@@ -10623,10 +10637,12 @@ static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
|
|
10623
10637
|
|
10624
10638
|
static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
|
10625
10639
|
const int ne00, const int ne01,
|
10626
|
-
const int
|
10627
|
-
const int
|
10628
|
-
const int
|
10629
|
-
const int
|
10640
|
+
const int ne02, const int nb00,
|
10641
|
+
const int nb01, const int nb02,
|
10642
|
+
const int nb03, const int ne10,
|
10643
|
+
const int ne11, const int ne12,
|
10644
|
+
const int nb10, const int nb11,
|
10645
|
+
const int nb12, const int nb13,
|
10630
10646
|
dpct::queue_ptr stream) {
|
10631
10647
|
|
10632
10648
|
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
@@ -10639,8 +10655,8 @@ static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
|
|
10639
10655
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
10640
10656
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
10641
10657
|
[=](sycl::nd_item<3> item_ct1) {
|
10642
|
-
cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
|
10643
|
-
|
10658
|
+
cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
10659
|
+
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
10644
10660
|
item_ct1);
|
10645
10661
|
});
|
10646
10662
|
}
|
@@ -10648,10 +10664,12 @@ static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
|
|
10648
10664
|
|
10649
10665
|
static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne,
|
10650
10666
|
const int ne00, const int ne01,
|
10651
|
-
const int
|
10652
|
-
const int
|
10653
|
-
const int
|
10654
|
-
const int
|
10667
|
+
const int ne02, const int nb00,
|
10668
|
+
const int nb01, const int nb02,
|
10669
|
+
const int nb03, const int ne10,
|
10670
|
+
const int ne11, const int ne12,
|
10671
|
+
const int nb10, const int nb11,
|
10672
|
+
const int nb12, const int nb13,
|
10655
10673
|
dpct::queue_ptr stream) {
|
10656
10674
|
|
10657
10675
|
GGML_ASSERT(ne % QK8_0 == 0);
|
@@ -10660,17 +10678,20 @@ static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne,
|
|
10660
10678
|
sycl::range<3>(1, 1, 1)),
|
10661
10679
|
[=](sycl::nd_item<3> item_ct1) {
|
10662
10680
|
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(
|
10663
|
-
cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
|
10664
|
-
ne10, ne11, nb10, nb11, nb12,
|
10681
|
+
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
10682
|
+
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
10683
|
+
item_ct1);
|
10665
10684
|
});
|
10666
10685
|
}
|
10667
10686
|
|
10668
10687
|
static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne,
|
10669
10688
|
const int ne00, const int ne01,
|
10670
|
-
const int
|
10671
|
-
const int
|
10672
|
-
const int
|
10673
|
-
const int
|
10689
|
+
const int ne02, const int nb00,
|
10690
|
+
const int nb01, const int nb02,
|
10691
|
+
const int nb03, const int ne10,
|
10692
|
+
const int ne11, const int ne12,
|
10693
|
+
const int nb10, const int nb11,
|
10694
|
+
const int nb12, const int nb13,
|
10674
10695
|
dpct::queue_ptr stream) {
|
10675
10696
|
|
10676
10697
|
GGML_ASSERT(ne % QK4_0 == 0);
|
@@ -10679,17 +10700,20 @@ static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne,
|
|
10679
10700
|
sycl::range<3>(1, 1, 1)),
|
10680
10701
|
[=](sycl::nd_item<3> item_ct1) {
|
10681
10702
|
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(
|
10682
|
-
cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
|
10683
|
-
ne10, ne11, nb10, nb11, nb12,
|
10703
|
+
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
10704
|
+
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
10705
|
+
item_ct1);
|
10684
10706
|
});
|
10685
10707
|
}
|
10686
10708
|
|
10687
10709
|
static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne,
|
10688
10710
|
const int ne00, const int ne01,
|
10689
|
-
const int
|
10690
|
-
const int
|
10691
|
-
const int
|
10692
|
-
const int
|
10711
|
+
const int ne02, const int nb00,
|
10712
|
+
const int nb01, const int nb02,
|
10713
|
+
const int nb03, const int ne10,
|
10714
|
+
const int ne11, const int ne12,
|
10715
|
+
const int nb10, const int nb11,
|
10716
|
+
const int nb12, const int nb13,
|
10693
10717
|
dpct::queue_ptr stream) {
|
10694
10718
|
|
10695
10719
|
GGML_ASSERT(ne % QK4_1 == 0);
|
@@ -10698,17 +10722,20 @@ static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne,
|
|
10698
10722
|
sycl::range<3>(1, 1, 1)),
|
10699
10723
|
[=](sycl::nd_item<3> item_ct1) {
|
10700
10724
|
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(
|
10701
|
-
cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
|
10702
|
-
ne10, ne11, nb10, nb11, nb12,
|
10725
|
+
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
10726
|
+
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
10727
|
+
item_ct1);
|
10703
10728
|
});
|
10704
10729
|
}
|
10705
10730
|
|
10706
10731
|
static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
|
10707
10732
|
const int ne00, const int ne01,
|
10708
|
-
const int
|
10709
|
-
const int
|
10710
|
-
const int
|
10711
|
-
const int
|
10733
|
+
const int ne02, const int nb00,
|
10734
|
+
const int nb01, const int nb02,
|
10735
|
+
const int nb03, const int ne10,
|
10736
|
+
const int ne11, const int ne12,
|
10737
|
+
const int nb10, const int nb11,
|
10738
|
+
const int nb12, const int nb13,
|
10712
10739
|
dpct::queue_ptr stream) {
|
10713
10740
|
|
10714
10741
|
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
@@ -10721,8 +10748,8 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
|
|
10721
10748
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
10722
10749
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
10723
10750
|
[=](sycl::nd_item<3> item_ct1) {
|
10724
|
-
cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
|
10725
|
-
|
10751
|
+
cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
10752
|
+
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
10726
10753
|
item_ct1);
|
10727
10754
|
});
|
10728
10755
|
}
|
@@ -10730,10 +10757,12 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
|
|
10730
10757
|
|
10731
10758
|
static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
|
10732
10759
|
const int ne00, const int ne01,
|
10733
|
-
const int
|
10734
|
-
const int
|
10735
|
-
const int
|
10736
|
-
const int
|
10760
|
+
const int ne02, const int nb00,
|
10761
|
+
const int nb01, const int nb02,
|
10762
|
+
const int nb03, const int ne10,
|
10763
|
+
const int ne11, const int ne12,
|
10764
|
+
const int nb10, const int nb11,
|
10765
|
+
const int nb12, const int nb13,
|
10737
10766
|
dpct::queue_ptr stream) {
|
10738
10767
|
|
10739
10768
|
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
@@ -10746,8 +10775,8 @@ static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
|
|
10746
10775
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
10747
10776
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
10748
10777
|
[=](sycl::nd_item<3> item_ct1) {
|
10749
|
-
cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
|
10750
|
-
|
10778
|
+
cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
10779
|
+
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
10751
10780
|
item_ct1);
|
10752
10781
|
});
|
10753
10782
|
}
|
@@ -10755,10 +10784,12 @@ static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
|
|
10755
10784
|
|
10756
10785
|
static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne,
|
10757
10786
|
const int ne00, const int ne01,
|
10758
|
-
const int
|
10759
|
-
const int
|
10760
|
-
const int
|
10761
|
-
const int
|
10787
|
+
const int ne02, const int nb00,
|
10788
|
+
const int nb01, const int nb02,
|
10789
|
+
const int nb03, const int ne10,
|
10790
|
+
const int ne11, const int ne12,
|
10791
|
+
const int nb10, const int nb11,
|
10792
|
+
const int nb12, const int nb13,
|
10762
10793
|
dpct::queue_ptr stream) {
|
10763
10794
|
|
10764
10795
|
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
@@ -10771,8 +10802,8 @@ static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne,
|
|
10771
10802
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
10772
10803
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
10773
10804
|
[=](sycl::nd_item<3> item_ct1) {
|
10774
|
-
cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, nb00, nb01,
|
10775
|
-
|
10805
|
+
cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
10806
|
+
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
10776
10807
|
item_ct1);
|
10777
10808
|
});
|
10778
10809
|
}
|
@@ -11019,7 +11050,8 @@ static void soft_max_f32_sycl(const float *x, const float *y, float *dst,
|
|
11019
11050
|
});
|
11020
11051
|
}
|
11021
11052
|
|
11022
|
-
|
11053
|
+
template <typename T>
|
11054
|
+
static void im2col_sycl(const float *x, T *dst, int IW, int IH,
|
11023
11055
|
int OW, int OH, int KW, int KH, int IC,
|
11024
11056
|
int offset_delta, int s0, int s1, int p0,
|
11025
11057
|
int p1, int d0, int d1,
|
@@ -11036,7 +11068,7 @@ static void im2col_f32_f16_sycl(const float *x, sycl::half *dst, int IW, int IH,
|
|
11036
11068
|
sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE),
|
11037
11069
|
sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)),
|
11038
11070
|
[=](sycl::nd_item<3> item_ct1) {
|
11039
|
-
|
11071
|
+
im2col_kernel(x, dst, offset_delta, IW, IH, OW, KW, KH,
|
11040
11072
|
parallel_elements, (IC * KH * KW), s0, s1, p0,
|
11041
11073
|
p1, d0, d1, item_ct1);
|
11042
11074
|
});
|
@@ -12116,7 +12148,8 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
|
|
12116
12148
|
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
12117
12149
|
const dpct::queue_ptr &stream) {
|
12118
12150
|
|
12119
|
-
|
12151
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
12152
|
+
|
12120
12153
|
const int64_t row_diff = row_high - row_low;
|
12121
12154
|
|
12122
12155
|
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
@@ -12135,8 +12168,9 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
|
|
12135
12168
|
} else {
|
12136
12169
|
src1_dfloat = src1_dfloat_a.alloc(ne00);
|
12137
12170
|
ggml_cpy_f32_f16_sycl((const char *)src1_ddf_i, (char *)src1_dfloat,
|
12138
|
-
ne00, ne00,
|
12139
|
-
|
12171
|
+
ne00, ne00, ne01, ne02, nb00, nb01, nb02,
|
12172
|
+
nb03, ne10, ne11, ne12, nb10, nb11, nb12,
|
12173
|
+
nb13, stream);
|
12140
12174
|
}
|
12141
12175
|
}
|
12142
12176
|
#else
|
@@ -12424,7 +12458,7 @@ inline void ggml_sycl_op_im2col(const ggml_tensor *src0,
|
|
12424
12458
|
|
12425
12459
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
12426
12460
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
12427
|
-
GGML_ASSERT( dst->type == GGML_TYPE_F16);
|
12461
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
|
12428
12462
|
|
12429
12463
|
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
|
12430
12464
|
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
|
@@ -12447,8 +12481,11 @@ inline void ggml_sycl_op_im2col(const ggml_tensor *src0,
|
|
12447
12481
|
|
12448
12482
|
const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
|
12449
12483
|
|
12450
|
-
|
12451
|
-
|
12484
|
+
if (dst->type == GGML_TYPE_F16) {
|
12485
|
+
im2col_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
|
12486
|
+
} else {
|
12487
|
+
im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
|
12488
|
+
}
|
12452
12489
|
|
12453
12490
|
(void) src0;
|
12454
12491
|
(void) src0_dd;
|
@@ -13905,19 +13942,23 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
13905
13942
|
|
13906
13943
|
const int64_t ne00 = src0->ne[0];
|
13907
13944
|
const int64_t ne01 = src0->ne[1];
|
13908
|
-
|
13945
|
+
const int64_t ne02 = src0->ne[2];
|
13946
|
+
|
13909
13947
|
|
13910
13948
|
const int64_t nb00 = src0->nb[0];
|
13911
13949
|
const int64_t nb01 = src0->nb[1];
|
13912
13950
|
const int64_t nb02 = src0->nb[2];
|
13951
|
+
const int64_t nb03 = src0->nb[3];
|
13913
13952
|
|
13914
13953
|
const int64_t ne10 = src1->ne[0];
|
13915
13954
|
const int64_t ne11 = src1->ne[1];
|
13916
|
-
|
13955
|
+
const int64_t ne12 = src1->ne[2];
|
13956
|
+
|
13917
13957
|
|
13918
13958
|
const int64_t nb10 = src1->nb[0];
|
13919
13959
|
const int64_t nb11 = src1->nb[1];
|
13920
13960
|
const int64_t nb12 = src1->nb[2];
|
13961
|
+
const int64_t nb13 = src1->nb[3];
|
13921
13962
|
|
13922
13963
|
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
|
13923
13964
|
dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
|
@@ -13929,21 +13970,21 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
13929
13970
|
char * src1_ddc = (char *) src1_extra->data_device[g_main_device_index];
|
13930
13971
|
|
13931
13972
|
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
13932
|
-
ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
13973
|
+
ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
13933
13974
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
13934
|
-
ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
13975
|
+
ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
13935
13976
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
13936
|
-
ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
13977
|
+
ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
13937
13978
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
13938
|
-
ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
13979
|
+
ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
13939
13980
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
13940
|
-
ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
13981
|
+
ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
13941
13982
|
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
13942
|
-
ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
13983
|
+
ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
13943
13984
|
} else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) {
|
13944
|
-
ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
13985
|
+
ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
13945
13986
|
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
|
13946
|
-
ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
13987
|
+
ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
13947
13988
|
} else {
|
13948
13989
|
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
13949
13990
|
ggml_type_name(src0->type), ggml_type_name(src1->type));
|