llama_cpp 0.12.4 → 0.12.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/llama_cpp.cpp +46 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +146 -53
- data/vendor/tmp/llama.cpp/ggml-alloc.c +563 -490
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +250 -262
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +688 -270
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +2 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +386 -134
- data/vendor/tmp/llama.cpp/ggml-quants.h +68 -59
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +139 -145
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1516 -10656
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1777 -1238
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +14 -9
- data/vendor/tmp/llama.cpp/ggml.c +147 -70
- data/vendor/tmp/llama.cpp/ggml.h +26 -6
- data/vendor/tmp/llama.cpp/llama.cpp +920 -173
- data/vendor/tmp/llama.cpp/llama.h +7 -1
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
@@ -7693,6 +7693,13 @@ static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
|
|
7693
7693
|
*dsti = *xi;
|
7694
7694
|
}
|
7695
7695
|
|
7696
|
+
static void cpy_1_f16_f32(const char * cxi, char * cdsti) {
|
7697
|
+
const sycl::half *xi = (const sycl::half *)cxi;
|
7698
|
+
float *dsti = (float *)cdsti;
|
7699
|
+
|
7700
|
+
*dsti = *xi;
|
7701
|
+
}
|
7702
|
+
|
7696
7703
|
static void cpy_1_i16_i16(const char * cxi, char * cdsti) {
|
7697
7704
|
const int16_t *xi = (const int16_t *)cxi;
|
7698
7705
|
int16_t *dsti = (int16_t *)cdsti;
|
@@ -7709,9 +7716,9 @@ static void cpy_1_i32_i32(const char * cxi, char * cdsti) {
|
|
7709
7716
|
|
7710
7717
|
template <cpy_kernel_t cpy_1>
|
7711
7718
|
static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
7712
|
-
|
7713
|
-
|
7714
|
-
|
7719
|
+
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
7720
|
+
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
7721
|
+
const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) {
|
7715
7722
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
7716
7723
|
item_ct1.get_local_id(2);
|
7717
7724
|
|
@@ -7721,15 +7728,17 @@ static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
|
7721
7728
|
|
7722
7729
|
// determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
|
7723
7730
|
// then combine those indices with the corresponding byte offsets to get the total offsets
|
7724
|
-
const int
|
7725
|
-
const int
|
7726
|
-
const int
|
7727
|
-
const int
|
7728
|
-
|
7729
|
-
|
7730
|
-
const int
|
7731
|
-
const int
|
7732
|
-
const int
|
7731
|
+
const int i03 = i/(ne00 * ne01 * ne02);
|
7732
|
+
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
7733
|
+
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
7734
|
+
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
7735
|
+
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
7736
|
+
|
7737
|
+
const int i13 = i/(ne10 * ne11 * ne12);
|
7738
|
+
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
7739
|
+
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
7740
|
+
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
7741
|
+
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
|
7733
7742
|
|
7734
7743
|
cpy_1(cx + x_offset, cdst + dst_offset);
|
7735
7744
|
}
|
@@ -7823,9 +7832,9 @@ static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
|
|
7823
7832
|
|
7824
7833
|
template <cpy_kernel_t cpy_blck, int qk>
|
7825
7834
|
static void cpy_f32_q(const char * cx, char * cdst, const int ne,
|
7826
|
-
|
7827
|
-
|
7828
|
-
|
7835
|
+
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
7836
|
+
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
7837
|
+
const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) {
|
7829
7838
|
const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
7830
7839
|
item_ct1.get_local_id(2)) *
|
7831
7840
|
qk;
|
@@ -7834,15 +7843,17 @@ static void cpy_f32_q(const char * cx, char * cdst, const int ne,
|
|
7834
7843
|
return;
|
7835
7844
|
}
|
7836
7845
|
|
7837
|
-
const int
|
7838
|
-
const int
|
7839
|
-
const int
|
7840
|
-
const int
|
7846
|
+
const int i03 = i/(ne00 * ne01 * ne02);
|
7847
|
+
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
7848
|
+
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
7849
|
+
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
7850
|
+
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
7841
7851
|
|
7842
|
-
const int
|
7843
|
-
const int
|
7844
|
-
const int
|
7845
|
-
const int
|
7852
|
+
const int i13 = i/(ne10 * ne11 * ne12);
|
7853
|
+
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
7854
|
+
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
7855
|
+
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
7856
|
+
const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
|
7846
7857
|
|
7847
7858
|
cpy_blck(cx + x_offset, cdst + dst_offset);
|
7848
7859
|
}
|
@@ -8247,7 +8258,8 @@ static void clamp_f32(const float * x, float * dst, const float min, const float
|
|
8247
8258
|
dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
|
8248
8259
|
}
|
8249
8260
|
|
8250
|
-
|
8261
|
+
template <typename T>
|
8262
|
+
static void im2col_kernel(const float *x, T *dst, int offset_delta,
|
8251
8263
|
int IW, int IH, int OW, int KW, int KH,
|
8252
8264
|
int pelements, int CHW, int s0, int s1, int p0,
|
8253
8265
|
int p1, int d0, int d1,
|
@@ -10598,10 +10610,12 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
|
|
10598
10610
|
|
10599
10611
|
static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
|
10600
10612
|
const int ne00, const int ne01,
|
10601
|
-
const int
|
10602
|
-
const int
|
10603
|
-
const int
|
10604
|
-
const int
|
10613
|
+
const int ne02, const int nb00,
|
10614
|
+
const int nb01, const int nb02,
|
10615
|
+
const int nb03, const int ne10,
|
10616
|
+
const int ne11, const int ne12,
|
10617
|
+
const int nb10, const int nb11,
|
10618
|
+
const int nb12, const int nb13,
|
10605
10619
|
dpct::queue_ptr stream) {
|
10606
10620
|
|
10607
10621
|
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
@@ -10614,8 +10628,8 @@ static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
|
|
10614
10628
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
10615
10629
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
10616
10630
|
[=](sycl::nd_item<3> item_ct1) {
|
10617
|
-
cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, nb00, nb01,
|
10618
|
-
|
10631
|
+
cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
10632
|
+
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
10619
10633
|
item_ct1);
|
10620
10634
|
});
|
10621
10635
|
}
|
@@ -10623,10 +10637,12 @@ static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
|
|
10623
10637
|
|
10624
10638
|
static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
|
10625
10639
|
const int ne00, const int ne01,
|
10626
|
-
const int
|
10627
|
-
const int
|
10628
|
-
const int
|
10629
|
-
const int
|
10640
|
+
const int ne02, const int nb00,
|
10641
|
+
const int nb01, const int nb02,
|
10642
|
+
const int nb03, const int ne10,
|
10643
|
+
const int ne11, const int ne12,
|
10644
|
+
const int nb10, const int nb11,
|
10645
|
+
const int nb12, const int nb13,
|
10630
10646
|
dpct::queue_ptr stream) {
|
10631
10647
|
|
10632
10648
|
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
@@ -10639,8 +10655,8 @@ static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
|
|
10639
10655
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
10640
10656
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
10641
10657
|
[=](sycl::nd_item<3> item_ct1) {
|
10642
|
-
cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
|
10643
|
-
|
10658
|
+
cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
10659
|
+
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
10644
10660
|
item_ct1);
|
10645
10661
|
});
|
10646
10662
|
}
|
@@ -10648,10 +10664,12 @@ static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
|
|
10648
10664
|
|
10649
10665
|
static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne,
|
10650
10666
|
const int ne00, const int ne01,
|
10651
|
-
const int
|
10652
|
-
const int
|
10653
|
-
const int
|
10654
|
-
const int
|
10667
|
+
const int ne02, const int nb00,
|
10668
|
+
const int nb01, const int nb02,
|
10669
|
+
const int nb03, const int ne10,
|
10670
|
+
const int ne11, const int ne12,
|
10671
|
+
const int nb10, const int nb11,
|
10672
|
+
const int nb12, const int nb13,
|
10655
10673
|
dpct::queue_ptr stream) {
|
10656
10674
|
|
10657
10675
|
GGML_ASSERT(ne % QK8_0 == 0);
|
@@ -10660,17 +10678,20 @@ static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne,
|
|
10660
10678
|
sycl::range<3>(1, 1, 1)),
|
10661
10679
|
[=](sycl::nd_item<3> item_ct1) {
|
10662
10680
|
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(
|
10663
|
-
cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
|
10664
|
-
ne10, ne11, nb10, nb11, nb12,
|
10681
|
+
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
10682
|
+
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
10683
|
+
item_ct1);
|
10665
10684
|
});
|
10666
10685
|
}
|
10667
10686
|
|
10668
10687
|
static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne,
|
10669
10688
|
const int ne00, const int ne01,
|
10670
|
-
const int
|
10671
|
-
const int
|
10672
|
-
const int
|
10673
|
-
const int
|
10689
|
+
const int ne02, const int nb00,
|
10690
|
+
const int nb01, const int nb02,
|
10691
|
+
const int nb03, const int ne10,
|
10692
|
+
const int ne11, const int ne12,
|
10693
|
+
const int nb10, const int nb11,
|
10694
|
+
const int nb12, const int nb13,
|
10674
10695
|
dpct::queue_ptr stream) {
|
10675
10696
|
|
10676
10697
|
GGML_ASSERT(ne % QK4_0 == 0);
|
@@ -10679,17 +10700,20 @@ static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne,
|
|
10679
10700
|
sycl::range<3>(1, 1, 1)),
|
10680
10701
|
[=](sycl::nd_item<3> item_ct1) {
|
10681
10702
|
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(
|
10682
|
-
cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
|
10683
|
-
ne10, ne11, nb10, nb11, nb12,
|
10703
|
+
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
10704
|
+
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
10705
|
+
item_ct1);
|
10684
10706
|
});
|
10685
10707
|
}
|
10686
10708
|
|
10687
10709
|
static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne,
|
10688
10710
|
const int ne00, const int ne01,
|
10689
|
-
const int
|
10690
|
-
const int
|
10691
|
-
const int
|
10692
|
-
const int
|
10711
|
+
const int ne02, const int nb00,
|
10712
|
+
const int nb01, const int nb02,
|
10713
|
+
const int nb03, const int ne10,
|
10714
|
+
const int ne11, const int ne12,
|
10715
|
+
const int nb10, const int nb11,
|
10716
|
+
const int nb12, const int nb13,
|
10693
10717
|
dpct::queue_ptr stream) {
|
10694
10718
|
|
10695
10719
|
GGML_ASSERT(ne % QK4_1 == 0);
|
@@ -10698,17 +10722,20 @@ static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne,
|
|
10698
10722
|
sycl::range<3>(1, 1, 1)),
|
10699
10723
|
[=](sycl::nd_item<3> item_ct1) {
|
10700
10724
|
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(
|
10701
|
-
cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
|
10702
|
-
ne10, ne11, nb10, nb11, nb12,
|
10725
|
+
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
10726
|
+
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
10727
|
+
item_ct1);
|
10703
10728
|
});
|
10704
10729
|
}
|
10705
10730
|
|
10706
10731
|
static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
|
10707
10732
|
const int ne00, const int ne01,
|
10708
|
-
const int
|
10709
|
-
const int
|
10710
|
-
const int
|
10711
|
-
const int
|
10733
|
+
const int ne02, const int nb00,
|
10734
|
+
const int nb01, const int nb02,
|
10735
|
+
const int nb03, const int ne10,
|
10736
|
+
const int ne11, const int ne12,
|
10737
|
+
const int nb10, const int nb11,
|
10738
|
+
const int nb12, const int nb13,
|
10712
10739
|
dpct::queue_ptr stream) {
|
10713
10740
|
|
10714
10741
|
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
@@ -10721,8 +10748,8 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
|
|
10721
10748
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
10722
10749
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
10723
10750
|
[=](sycl::nd_item<3> item_ct1) {
|
10724
|
-
cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
|
10725
|
-
|
10751
|
+
cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
10752
|
+
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
10726
10753
|
item_ct1);
|
10727
10754
|
});
|
10728
10755
|
}
|
@@ -10730,10 +10757,12 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
|
|
10730
10757
|
|
10731
10758
|
static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
|
10732
10759
|
const int ne00, const int ne01,
|
10733
|
-
const int
|
10734
|
-
const int
|
10735
|
-
const int
|
10736
|
-
const int
|
10760
|
+
const int ne02, const int nb00,
|
10761
|
+
const int nb01, const int nb02,
|
10762
|
+
const int nb03, const int ne10,
|
10763
|
+
const int ne11, const int ne12,
|
10764
|
+
const int nb10, const int nb11,
|
10765
|
+
const int nb12, const int nb13,
|
10737
10766
|
dpct::queue_ptr stream) {
|
10738
10767
|
|
10739
10768
|
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
@@ -10746,8 +10775,8 @@ static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
|
|
10746
10775
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
10747
10776
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
10748
10777
|
[=](sycl::nd_item<3> item_ct1) {
|
10749
|
-
cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
|
10750
|
-
|
10778
|
+
cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
10779
|
+
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
10751
10780
|
item_ct1);
|
10752
10781
|
});
|
10753
10782
|
}
|
@@ -10755,10 +10784,12 @@ static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
|
|
10755
10784
|
|
10756
10785
|
static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne,
|
10757
10786
|
const int ne00, const int ne01,
|
10758
|
-
const int
|
10759
|
-
const int
|
10760
|
-
const int
|
10761
|
-
const int
|
10787
|
+
const int ne02, const int nb00,
|
10788
|
+
const int nb01, const int nb02,
|
10789
|
+
const int nb03, const int ne10,
|
10790
|
+
const int ne11, const int ne12,
|
10791
|
+
const int nb10, const int nb11,
|
10792
|
+
const int nb12, const int nb13,
|
10762
10793
|
dpct::queue_ptr stream) {
|
10763
10794
|
|
10764
10795
|
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
@@ -10771,8 +10802,8 @@ static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne,
|
|
10771
10802
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
10772
10803
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
10773
10804
|
[=](sycl::nd_item<3> item_ct1) {
|
10774
|
-
cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, nb00, nb01,
|
10775
|
-
|
10805
|
+
cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
10806
|
+
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
10776
10807
|
item_ct1);
|
10777
10808
|
});
|
10778
10809
|
}
|
@@ -11019,7 +11050,8 @@ static void soft_max_f32_sycl(const float *x, const float *y, float *dst,
|
|
11019
11050
|
});
|
11020
11051
|
}
|
11021
11052
|
|
11022
|
-
|
11053
|
+
template <typename T>
|
11054
|
+
static void im2col_sycl(const float *x, T *dst, int IW, int IH,
|
11023
11055
|
int OW, int OH, int KW, int KH, int IC,
|
11024
11056
|
int offset_delta, int s0, int s1, int p0,
|
11025
11057
|
int p1, int d0, int d1,
|
@@ -11036,7 +11068,7 @@ static void im2col_f32_f16_sycl(const float *x, sycl::half *dst, int IW, int IH,
|
|
11036
11068
|
sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE),
|
11037
11069
|
sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)),
|
11038
11070
|
[=](sycl::nd_item<3> item_ct1) {
|
11039
|
-
|
11071
|
+
im2col_kernel(x, dst, offset_delta, IW, IH, OW, KW, KH,
|
11040
11072
|
parallel_elements, (IC * KH * KW), s0, s1, p0,
|
11041
11073
|
p1, d0, d1, item_ct1);
|
11042
11074
|
});
|
@@ -11546,11 +11578,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
|
|
11546
11578
|
}
|
11547
11579
|
char * dst_ptr = (char *) dst;
|
11548
11580
|
|
11549
|
-
|
11550
|
-
|
11551
|
-
const int64_t nb1 = src->nb[1];
|
11552
|
-
const int64_t nb2 = src->nb[2];
|
11553
|
-
const int64_t nb3 = src->nb[3];
|
11581
|
+
GGML_TENSOR_LOCALS_1(int64_t, ne, src, ne);
|
11582
|
+
GGML_TENSOR_LOCALS(int64_t, nb, src, nb);
|
11554
11583
|
const enum ggml_type type = src->type;
|
11555
11584
|
const int64_t ts = ggml_type_size(type);
|
11556
11585
|
const int64_t bs = ggml_blck_size(type);
|
@@ -12116,7 +12145,8 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
|
|
12116
12145
|
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
12117
12146
|
const dpct::queue_ptr &stream) {
|
12118
12147
|
|
12119
|
-
|
12148
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
12149
|
+
|
12120
12150
|
const int64_t row_diff = row_high - row_low;
|
12121
12151
|
|
12122
12152
|
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
@@ -12135,8 +12165,9 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
|
|
12135
12165
|
} else {
|
12136
12166
|
src1_dfloat = src1_dfloat_a.alloc(ne00);
|
12137
12167
|
ggml_cpy_f32_f16_sycl((const char *)src1_ddf_i, (char *)src1_dfloat,
|
12138
|
-
ne00, ne00,
|
12139
|
-
|
12168
|
+
ne00, ne00, ne01, ne02, nb00, nb01, nb02,
|
12169
|
+
nb03, ne10, ne11, ne12, nb10, nb11, nb12,
|
12170
|
+
nb13, stream);
|
12140
12171
|
}
|
12141
12172
|
}
|
12142
12173
|
#else
|
@@ -12392,9 +12423,7 @@ inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
12392
12423
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
12393
12424
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
12394
12425
|
|
12395
|
-
|
12396
|
-
const int64_t ne01 = src0->ne[1];
|
12397
|
-
const int64_t ne02 = src0->ne[2];
|
12426
|
+
GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
|
12398
12427
|
const int64_t nrows = ggml_nrows(src0);
|
12399
12428
|
|
12400
12429
|
//const int n_past = ((int32_t *) dst->op_params)[0];
|
@@ -12424,7 +12453,7 @@ inline void ggml_sycl_op_im2col(const ggml_tensor *src0,
|
|
12424
12453
|
|
12425
12454
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
12426
12455
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
12427
|
-
GGML_ASSERT( dst->type == GGML_TYPE_F16);
|
12456
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
|
12428
12457
|
|
12429
12458
|
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
|
12430
12459
|
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
|
@@ -12447,8 +12476,11 @@ inline void ggml_sycl_op_im2col(const ggml_tensor *src0,
|
|
12447
12476
|
|
12448
12477
|
const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
|
12449
12478
|
|
12450
|
-
|
12451
|
-
|
12479
|
+
if (dst->type == GGML_TYPE_F16) {
|
12480
|
+
im2col_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
|
12481
|
+
} else {
|
12482
|
+
im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
|
12483
|
+
}
|
12452
12484
|
|
12453
12485
|
(void) src0;
|
12454
12486
|
(void) src0_dd;
|
@@ -12721,15 +12753,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
|
12721
12753
|
ggml_sycl_op_mul_mat_t op,
|
12722
12754
|
const bool convert_src1_to_q8_1) try {
|
12723
12755
|
|
12724
|
-
|
12725
|
-
const int64_t ne01 = src0->ne[1];
|
12726
|
-
const int64_t ne02 = src0->ne[2];
|
12727
|
-
const int64_t ne03 = src0->ne[3];
|
12756
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
|
12728
12757
|
|
12729
|
-
|
12730
|
-
const int64_t ne11 = src1->ne[1];
|
12731
|
-
const int64_t ne12 = src1->ne[2];
|
12732
|
-
const int64_t ne13 = src1->ne[3];
|
12758
|
+
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
|
12733
12759
|
const int64_t nrows1 = ggml_nrows(src1);
|
12734
12760
|
|
12735
12761
|
GGML_ASSERT(ne03 == ne13);
|
@@ -13300,23 +13326,13 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
|
|
13300
13326
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
13301
13327
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
13302
13328
|
|
13303
|
-
|
13304
|
-
const int64_t ne01 = src0->ne[1];
|
13305
|
-
const int64_t ne02 = src0->ne[2];
|
13306
|
-
const int64_t ne03 = src0->ne[3];
|
13329
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
|
13307
13330
|
|
13308
|
-
|
13309
|
-
const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
|
13310
|
-
const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
|
13331
|
+
GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb);
|
13311
13332
|
|
13312
|
-
|
13313
|
-
const int64_t ne11 = src1->ne[1];
|
13314
|
-
const int64_t ne12 = src1->ne[2];
|
13315
|
-
const int64_t ne13 = src1->ne[3];
|
13333
|
+
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
|
13316
13334
|
|
13317
|
-
|
13318
|
-
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
|
13319
|
-
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
|
13335
|
+
GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
|
13320
13336
|
|
13321
13337
|
const int64_t ne1 = ggml_nelements(src1);
|
13322
13338
|
const int64_t ne = ggml_nelements(dst);
|
@@ -13618,23 +13634,15 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
|
|
13618
13634
|
GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
|
13619
13635
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
13620
13636
|
|
13621
|
-
|
13622
|
-
const int64_t ne01 = src00->ne[1];
|
13623
|
-
const int64_t ne02 = src00->ne[2];
|
13624
|
-
const int64_t ne03 = src00->ne[3];
|
13637
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
|
13625
13638
|
|
13626
13639
|
//const int64_t nb01 = src00->nb[1];
|
13627
|
-
|
13628
|
-
const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
|
13640
|
+
GGML_TENSOR_LOCALS(int64_t, nb0, src00, nb);
|
13629
13641
|
|
13630
|
-
|
13631
|
-
const int64_t ne11 = src1->ne[1];
|
13632
|
-
const int64_t ne12 = src1->ne[2];
|
13633
|
-
const int64_t ne13 = src1->ne[3];
|
13642
|
+
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
|
13634
13643
|
|
13644
|
+
GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
|
13635
13645
|
//const int64_t nb11 = src1->nb[1];
|
13636
|
-
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
|
13637
|
-
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
|
13638
13646
|
|
13639
13647
|
const int64_t ne1 = ggml_nelements(src1);
|
13640
13648
|
const int64_t ne = ggml_nelements(dst);
|
@@ -13903,21 +13911,7 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
13903
13911
|
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
|
13904
13912
|
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
|
13905
13913
|
|
13906
|
-
|
13907
|
-
const int64_t ne01 = src0->ne[1];
|
13908
|
-
GGML_ASSERT(src0->ne[3] == 1);
|
13909
|
-
|
13910
|
-
const int64_t nb00 = src0->nb[0];
|
13911
|
-
const int64_t nb01 = src0->nb[1];
|
13912
|
-
const int64_t nb02 = src0->nb[2];
|
13913
|
-
|
13914
|
-
const int64_t ne10 = src1->ne[0];
|
13915
|
-
const int64_t ne11 = src1->ne[1];
|
13916
|
-
GGML_ASSERT(src1->ne[3] == 1);
|
13917
|
-
|
13918
|
-
const int64_t nb10 = src1->nb[0];
|
13919
|
-
const int64_t nb11 = src1->nb[1];
|
13920
|
-
const int64_t nb12 = src1->nb[2];
|
13914
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
13921
13915
|
|
13922
13916
|
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
|
13923
13917
|
dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
|
@@ -13929,21 +13923,21 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
13929
13923
|
char * src1_ddc = (char *) src1_extra->data_device[g_main_device_index];
|
13930
13924
|
|
13931
13925
|
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
13932
|
-
ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
13926
|
+
ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
13933
13927
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
13934
|
-
ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
13928
|
+
ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
13935
13929
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
13936
|
-
ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
13930
|
+
ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
13937
13931
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
13938
|
-
ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
13932
|
+
ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
13939
13933
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
13940
|
-
ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
13934
|
+
ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
13941
13935
|
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
13942
|
-
ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
13936
|
+
ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
13943
13937
|
} else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) {
|
13944
|
-
ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
13938
|
+
ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
13945
13939
|
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
|
13946
|
-
ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
13940
|
+
ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
13947
13941
|
} else {
|
13948
13942
|
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
13949
13943
|
ggml_type_name(src0->type), ggml_type_name(src1->type));
|