llama_cpp 0.10.0 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +2 -0
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +691 -93
- data/ext/llama_cpp/src/ggml-metal.m +535 -54
- data/ext/llama_cpp/src/ggml-metal.metal +1497 -169
- data/ext/llama_cpp/src/ggml-quants.c +2 -2
- data/ext/llama_cpp/src/ggml.c +325 -159
- data/ext/llama_cpp/src/ggml.h +34 -13
- data/ext/llama_cpp/src/llama.cpp +195 -35
- data/ext/llama_cpp/src/llama.h +1 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +2 -2
@@ -1,13 +1,15 @@
 #include <algorithm>
+#include <assert.h>
+#include <atomic>
+#include <cinttypes>
 #include <cstddef>
 #include <cstdint>
-#include <cinttypes>
 #include <float.h>
 #include <limits>
 #include <stdint.h>
 #include <stdio.h>
-#include <
-
+#include <vector>
+
 
 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
@@ -437,6 +439,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_TANH_BLOCK_SIZE 256
 #define CUDA_RELU_BLOCK_SIZE 256
 #define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
@@ -449,6 +452,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 #define CUDA_GET_ROWS_BLOCK_SIZE 256
+#define CUDA_UPSCALE_BLOCK_SIZE 256
+#define CUDA_CONCAT_BLOCK_SIZE 256
+#define CUDA_PAD_BLOCK_SIZE 256
+#define CUDA_ACC_BLOCK_SIZE 256
+#define CUDA_IM2COL_BLOCK_SIZE 256
 
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
@@ -610,6 +618,24 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
     dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
 }
 
+static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, int offset) {
+    const int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= ne) {
+        return;
+    }
+    int src1_idx = i - offset;
+    int oz = src1_idx / nb2;
+    int oy = (src1_idx - (oz * nb2)) / nb1;
+    int ox = src1_idx % nb1;
+    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
+        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
+    } else {
+        dst[i] = x[i];
+    }
+}
+
 static __global__ void gelu_f32(const float * x, float * dst, const int k) {
     const float GELU_COEF_A = 0.044715f;
     const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
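Note on the hunk above: GGML_OP_ACC adds src1 into a strided window of src0 that starts at a given element offset, and acc_f32 reproduces that indexing one element per thread. As a sanity check of the index math only, here is a small CPU reference with a hypothetical name (acc_f32_reference) and made-up sizes; it is not part of the diff, and nb1/nb2/offset are already in float elements, as the kernel expects.

#include <cstdio>

// Reference for the per-element logic of acc_f32: dst = src0, with src1 added
// into the window that starts `offset` elements in and is laid out with row
// stride nb1 and plane stride nb2 (all counted in float elements).
static void acc_f32_reference(const float * x, const float * y, float * dst, int ne,
                              int ne10, int ne11, int ne12,
                              int nb1, int nb2, int offset) {
    for (int i = 0; i < ne; ++i) {
        const int src1_idx = i - offset;
        const int oz = src1_idx / nb2;
        const int oy = (src1_idx - oz*nb2) / nb1;
        const int ox = src1_idx % nb1;
        if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
            dst[i] = x[i] + y[ox + oy*ne10 + oz*ne10*ne11];
        } else {
            dst[i] = x[i];
        }
    }
}

int main() {
    // 8-element destination, accumulate a 2-element src1 starting at element 1
    float x[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    float y[2] = {10, 20};
    float dst[8];
    acc_f32_reference(x, y, dst, 8, /*ne10=*/2, /*ne11=*/1, /*ne12=*/1,
                      /*nb1=*/2, /*nb2=*/8, /*offset=*/1);
    for (float v : dst) printf("%g ", v); // 0 11 22 3 4 5 6 7
    printf("\n");
    return 0;
}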
@@ -632,6 +658,23 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __global__ void gelu_quick_f32(const float *x, float *dst, int k) {
+    const float GELU_QUICK_COEF = -1.702f;
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
+}
+
+static __global__ void tanh_f32(const float *x, float *dst, int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+    if (i >= k) {
+        return;
+    }
+    dst[i] = tanhf(x[i]);
+}
+
 static __global__ void relu_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -641,6 +684,14 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
     dst[i] = fmaxf(x[i], 0);
 }
 
+static __global__ void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+    if (i >= k) {
+        return;
+    }
+    dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
+}
+
 static __global__ void sqr_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
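The three activation kernels added above (gelu_quick_f32, tanh_f32, leaky_relu_f32) follow the same one-thread-per-element pattern as the existing relu_f32, and the launchers added later in this diff round the element count up to whole 256-thread blocks. A self-contained sketch of that launch pattern, compilable with nvcc; the kernel and names here are illustrative stand-ins, not code from llama.cpp:

#include <cstdio>
#include <cuda_runtime.h>

// Same shape as the kernels added above: one thread per element, guarded
// against the final partial block.
__global__ void leaky_relu_demo(const float * x, float * dst, int k, float slope) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = fmaxf(x[i], 0.0f) + fminf(x[i], 0.0f)*slope;
}

int main() {
    const int k = 1000;          // deliberately not a multiple of the block size
    const int block = 256;       // mirrors CUDA_RELU_BLOCK_SIZE in the diff
    const int grid  = (k + block - 1) / block;

    float *x, *dst;
    cudaMallocManaged(&x, k*sizeof(float));
    cudaMallocManaged(&dst, k*sizeof(float));
    for (int i = 0; i < k; ++i) x[i] = i - 500.0f;

    leaky_relu_demo<<<grid, block>>>(x, dst, k, 0.1f);
    cudaDeviceSynchronize();

    printf("dst[0]=%g dst[999]=%g\n", dst[0], dst[999]); // -50 and 499
    cudaFree(x);
    cudaFree(dst);
    return 0;
}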
@@ -686,6 +737,132 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c
     }
 }
 
+static __global__ void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02) {
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    // operation
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    if (blockIdx.z < ne02) { // src0
+        int offset_src =
+            nidx +
+            blockIdx.y * ne0 +
+            blockIdx.z * ne0 * gridDim.y;
+        dst[offset_dst] = x[offset_src];
+    } else {
+        int offset_src =
+            nidx +
+            blockIdx.y * ne0 +
+            (blockIdx.z - ne02) * ne0 * gridDim.y;
+        dst[offset_dst] = y[offset_src];
+    }
+}
+
+static __global__ void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor) {
+    int ne0 = ne00 * scale_factor;
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    // operation
+    int i00 = nidx / scale_factor;
+    int i01 = blockIdx.y / scale_factor;
+    int offset_src =
+        i00 +
+        i01 * ne00 +
+        blockIdx.z * nb02;
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    dst[offset_dst] = x[offset_src];
+}
+
+static __global__ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02) {
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+
+    // operation
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02) {
+        int offset_src =
+            nidx +
+            blockIdx.y * ne00 +
+            blockIdx.z * ne00 * ne01;
+        dst[offset_dst] = x[offset_src];
+    } else {
+        dst[offset_dst] = 0.0f;
+    }
+}
+
+template <int block_size>
+static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
+    int start = blockIdx.x * group_size;
+    int end = start + group_size;
+
+    start += threadIdx.x;
+
+    if (end >= ne_elements) {
+        end = ne_elements;
+    }
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int j = start; j < end; j += block_size) {
+        tmp += x[j];
+    }
+
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }
+
+    float mean = tmp / group_size;
+    tmp = 0.0f;
+
+    for (int j = start; j < end; j += block_size) {
+        float xi = x[j] - mean;
+        dst[j] = xi;
+        tmp += xi * xi;
+    }
+
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }
+
+    float variance = tmp / group_size;
+    float scale = rsqrtf(variance + eps);
+    for (int j = start; j < end; j += block_size) {
+        dst[j] *= scale;
+    }
+}
+
 template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
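group_norm_f32 above normalizes each group with two passes (mean, then variance), reducing partial sums first inside a warp with warp_reduce_sum and then across warps through the s_sum staging buffer. warp_reduce_sum itself is defined earlier in ggml-cuda.cu and does not appear in this diff; a typical shuffle-based reduction of that shape is sketched below for reference (illustrative only, the in-tree implementation may differ):

#include <cstdio>
#include <cuda_runtime.h>

#define WARP_SIZE 32

// Butterfly reduction: after the loop every lane of the warp holds the sum of
// the 32 input values. This mirrors the role warp_reduce_sum plays in the
// group_norm_f32 kernel above; the exact implementation in ggml-cuda.cu may differ.
static __device__ float warp_reduce_sum_demo(float x) {
    #pragma unroll
    for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, WARP_SIZE);
    }
    return x;
}

__global__ void sum_one_warp(const float * x, float * out) {
    float v = warp_reduce_sum_demo(x[threadIdx.x]);
    if (threadIdx.x == 0) {
        *out = v;
    }
}

int main() {
    float *x, *out;
    cudaMallocManaged(&x, WARP_SIZE*sizeof(float));
    cudaMallocManaged(&out, sizeof(float));
    for (int i = 0; i < WARP_SIZE; ++i) x[i] = 1.0f;

    sum_one_warp<<<1, WARP_SIZE>>>(x, out);
    cudaDeviceSynchronize();
    printf("sum = %g\n", *out); // 32

    cudaFree(x);
    cudaFree(out);
    return 0;
}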
@@ -1684,31 +1861,65 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
 }
 
 template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void k_get_rows(
-
-
-
-
+static __global__ void k_get_rows(
+            const void * src0, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
+
+    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
+    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
+    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+
+    if (i00 >= ne00) {
         return;
     }
 
-    const int
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
 
-
-    const
-    const int di = row*ncols + col;
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
 
-    const int ib =
-    const int iqs = (
-    const int iybs =
+    const int ib = i00/qk; // block index
+    const int iqs = (i00%qk)/qr; // quant index
+    const int iybs = i00 - i00%qk; // dst block start index
     const int y_offset = qr == 1 ? 1 : qk/2;
 
     // dequantize
     dfloat2 v;
-    dequantize_kernel(
+    dequantize_kernel(src0_row, ib, iqs, v);
 
-
-
+    dst_row[iybs + iqs + 0] = v.x;
+    dst_row[iybs + iqs + y_offset] = v.y;
+}
+
+template<typename src0_t, typename dst_t>
+static __global__ void k_get_rows_float(
+            const src0_t * src0, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
+
+    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
+    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
+    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
+
+    dst_row[i00] = src0_row[i00];
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
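The rewritten k_get_rows / k_get_rows_float kernels extend GET_ROWS to batched lookups: src1 holds one row index per (i10, i11, i12) position, and the selected row of src0 is copied (dequantized on the fly in the quantized path) into the corresponding row of dst. Below is a host-side reference of the float-path mapping for the simplest contiguous, non-batched case; the helper name and sizes are made up for illustration and are not part of the diff.

#include <cstdio>
#include <cstdint>

// Contiguous float reference for the new k_get_rows_float indexing:
// dst[i10, :] = src0[src1[i10], :]  (ne11 = ne12 = 1, all tensors contiguous)
static void get_rows_float_reference(const float * src0, const int32_t * src1, float * dst,
                                     int ne00, int ne10) {
    for (int i10 = 0; i10 < ne10; ++i10) {
        const int i01 = src1[i10];          // which source row to gather
        for (int i00 = 0; i00 < ne00; ++i00) {
            dst[i10*ne00 + i00] = src0[i01*ne00 + i00];
        }
    }
}

int main() {
    const float src0[3*4] = {  0,  1,  2,  3,
                              10, 11, 12, 13,
                              20, 21, 22, 23 };
    const int32_t src1[2] = { 2, 0 };
    float dst[2*4];
    get_rows_float_reference(src0, src1, dst, 4, 2);
    for (float v : dst) printf("%g ", v); // 20 21 22 23 0 1 2 3
    printf("\n");
    return 0;
}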
@@ -5035,29 +5246,98 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
 
 static __global__ void im2col_f32_f16(
         const float * x, half * dst,
-        int
+        int offset_delta, int IW, int IH, int OW, int KW, int KH, int pelements, int CHW,
         int s0, int s1, int p0, int p1, int d0, int d1) {
-    const int
-
+    const int i = threadIdx.x + blockIdx.x * blockDim.x;
+    if (i >= pelements) {
+        return;
+    }
+
+    const int ksize = OW * (KH > 1 ? KW : 1);
+    const int kx = i / ksize;
+    const int kd = kx * ksize;
+    const int ky = (i - kd) / OW;
+    const int ix = i % OW;
+
+    const int iiw = ix * s0 + kx * d0 - p0;
+    const int iih = blockIdx.y * s1 + ky * d1 - p1;
 
     const int offset_dst =
-        (
-        (blockIdx.
+        (blockIdx.y * OW + ix) * CHW +
+        (blockIdx.z * (KW * KH) + ky * KW + kx);
 
     if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
         dst[offset_dst] = __float2half(0.0f);
     } else {
-        const int offset_src =
+        const int offset_src = blockIdx.z * offset_delta;
         dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
     }
 }
 
 template<int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_cuda(const
+static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+                            const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (
-    const dim3 block_nums(block_num_x,
-
+    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    GGML_ASSERT(ne00 % 2 == 0);
+
+    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
+            src0_dd, src1_dd, dst_dd,
+            ne00, /*ne01, ne02, ne03,*/
+            /*ne10, ne11,*/ ne12, /*ne13,*/
+            /* s0,*/ s1, s2, s3,
+            /* nb00,*/ nb01, nb02, nb03,
+            s10, s11, s12/*, s13*/);
+
+    (void) dst;
+}
+
+template<typename src0_t>
+static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+                                const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
+    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
+            src0_dd, src1_dd, dst_dd,
+            ne00, /*ne01, ne02, ne03,*/
+            /*ne10, ne11,*/ ne12, /*ne13,*/
+            /* s0,*/ s1, s2, s3,
+            /* nb00,*/ nb01, nb02, nb03,
+            s10, s11, s12/*, s13*/);
+
+    (void) dst;
 }
 
 template<float (*bin_op)(const float, const float)>
@@ -5069,7 +5349,6 @@ struct bin_bcast_cuda {
 
         GGML_TENSOR_BINARY_OP_LOCALS
 
-
         int nr0 = ne10/ne0;
         int nr1 = ne11/ne1;
         int nr2 = ne12/ne2;
@@ -5117,26 +5396,28 @@ struct bin_bcast_cuda {
         int64_t ne12 = cne1[2];
         int64_t ne13 = cne1[3];
 
-
+        size_t nb0 = cnb0[0];
         size_t nb1 = cnb0[1];
        size_t nb2 = cnb0[2];
        size_t nb3 = cnb0[3];
 
-
+        size_t nb10 = cnb1[0];
        size_t nb11 = cnb1[1];
        size_t nb12 = cnb1[2];
        size_t nb13 = cnb1[3];
 
-
-        size_t s1 = nb1 / sizeof(
-        size_t s2 = nb2 / sizeof(
-        size_t s3 = nb3 / sizeof(
+        size_t s0 = nb0 / sizeof(dst_t);
+        size_t s1 = nb1 / sizeof(dst_t);
+        size_t s2 = nb2 / sizeof(dst_t);
+        size_t s3 = nb3 / sizeof(dst_t);
 
-
+        size_t s10 = nb10 / sizeof(src1_t);
        size_t s11 = nb11 / sizeof(src1_t);
        size_t s12 = nb12 / sizeof(src1_t);
        size_t s13 = nb13 / sizeof(src1_t);
 
+        GGML_ASSERT(s0 == 1);
+        GGML_ASSERT(s10 == 1);
 
        const int block_size = 128;
 
@@ -5174,6 +5455,13 @@ struct bin_bcast_cuda {
     }
 };
 
+static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
+    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
+}
+
 static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
     gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -5184,11 +5472,26 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
+    tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
     relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
+}
+
 static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
     sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -5205,6 +5508,38 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
     }
 }
 
+static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
+    static const float eps = 1e-6f;
+    if (group_size < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
+    }
+}
+
+static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2);
+    concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
+}
+
+static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int scale_factor, cudaStream_t stream) {
+    int ne0 = (ne00 * scale_factor);
+    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02);
+    upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
+}
+
+static void pad_f32_cuda(const float * x, float * dst,
+    const int ne00, const int ne01, const int ne02,
+    const int ne0, const int ne1, const int ne2, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2);
+    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02);
+}
+
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
@@ -6167,13 +6502,14 @@ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, con
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
 }
 
-static void im2col_f32_f16_cuda(const float
-    int
-    int
-    int s0,
-
-
-
+static void im2col_f32_f16_cuda(const float* x, half* dst,
+    int IW, int IH, int OW, int OH, int KW, int KH, int IC,
+    int offset_delta,
+    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
+    const int parallel_elements = OW * KW * KH;
+    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
+    dim3 block_nums(num_blocks, OH, IC);
+    im2col_f32_f16<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, offset_delta, IW, IH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
 }
 
 // buffer pool for cuda
@@ -6447,36 +6783,34 @@ static void ggml_cuda_op_get_rows(
 
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(src1));
-    GGML_ASSERT(ggml_is_contiguous(dst));
 
-
-
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
+    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
 
     const int32_t * src1_i32 = (const int32_t *) src1_d;
 
     switch (src0->type) {
         case GGML_TYPE_F16:
-
+            get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
             break;
         case GGML_TYPE_F32:
-
+            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
             break;
         case GGML_TYPE_Q4_0:
-            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(
+            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
             break;
         case GGML_TYPE_Q4_1:
-            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(
+            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
             break;
        case GGML_TYPE_Q5_0:
-            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(
+            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q5_1:
-            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(
+            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q8_0:
-            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(
+            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        default:
            // TODO: k-quants
@@ -6522,6 +6856,25 @@ inline void ggml_cuda_op_add(
     ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
 }
 
+inline void ggml_cuda_op_acc(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
+
+    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
+    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
+    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+    int offset = dst->op_params[3] / 4; // offset in bytes
+
+    acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
+
+    (void) dst;
+}
+
 inline void ggml_cuda_op_mul(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6564,6 +6917,34 @@ inline void ggml_cuda_op_silu(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_gelu_quick(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_tanh(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_cuda_op_relu(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6578,6 +6959,23 @@ inline void ggml_cuda_op_relu(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_leaky_relu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float negative_slope;
+    memcpy(&negative_slope, dst->op_params, sizeof(float));
+
+    leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_cuda_op_sqr(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6612,6 +7010,71 @@ inline void ggml_cuda_op_norm(
     (void) src1_dd;
 }
 
+
+inline void ggml_cuda_op_group_norm(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int num_groups = dst->op_params[0];
+    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
+    group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_concat(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+        concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
+    }
+
+    (void) src1;
+    (void) dst;
+}
+
+inline void ggml_cuda_op_upscale(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    const int scale_factor = dst->op_params[0];
+
+    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
+
+    (void) src1;
+    (void) dst;
+}
+
+inline void ggml_cuda_op_pad(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    pad_f32_cuda(src0_dd, dst_dd,
+        src0->ne[0], src0->ne[1], src0->ne[2],
+        dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
+
+    (void) src1;
+    (void) dst;
+}
+
 inline void ggml_cuda_op_rms_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7126,7 +7589,6 @@ inline void ggml_cuda_op_im2col(
 
     const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
 
-    const int64_t N = src1->ne[is_2D ? 3 : 2];
     const int64_t IC = src1->ne[is_2D ? 2 : 1];
     const int64_t IH = is_2D ? src1->ne[1] : 1;
     const int64_t IW = src1->ne[0];
@@ -7137,17 +7599,15 @@ inline void ggml_cuda_op_im2col(
     const int64_t OH = is_2D ? dst->ne[2] : 1;
     const int64_t OW = dst->ne[1];
 
-    const size_t
-    const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
 
-    im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
-        OH, IW, IH, OW, IC, KH, KW, N,
-        ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
+    im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
 
     (void) src0;
     (void) src0_dd;
 }
 
+
 inline void ggml_cuda_op_sum_rows(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7696,6 +8156,10 @@ static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, gg
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }
 
+static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
+}
+
 static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
 }
@@ -7712,10 +8176,22 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }
 
+static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
+}
+
+static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
+}
+
 static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
 }
 
+static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
+}
+
 static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
 }
@@ -7724,6 +8200,22 @@ static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }
 
+static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
+}
+
+static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
+}
+
+static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
+}
+
+static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
+}
+
 static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }
@@ -8234,36 +8726,69 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
 }
 #endif
 
-static void ggml_cuda_mul_mat_id(const ggml_tensor *
+static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #if 0
-//#ifdef CUDA_USE_TENSOR_CORES
-// const bool use_tensor_cores = true;
-//#else
-// const bool use_tensor_cores = false;
-//#endif
-
     ggml_cuda_mul_mat_id_cublas(dst);
-
     // TODO: mmq/mmv support
-#
-    const struct ggml_tensor * ids = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-    const int id = dst->op_params[0];
+#endif
 
-
+    GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
 
-
-
-
+    const struct ggml_tensor * ids = src0;
+    const int32_t id = ((int32_t *) dst->op_params)[0];
+    const int32_t n_as = ((int32_t *) dst->op_params)[1];
 
-
-    const struct ggml_tensor * src0 = dst->src[a_id + 2];
+    std::vector<char> ids_host(ggml_nbytes(ids));
 
-
-
+    if (ids->backend == GGML_BACKEND_GPU) {
+        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
+        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+        CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+    } else {
+        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
+    }
+
+    const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
+    const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
+
+    ggml_tensor_extra_gpu src1_row_extra;
+    ggml_tensor_extra_gpu dst_row_extra;
+
+    ggml_tensor src1_row = *src1;
+    ggml_tensor dst_row = *dst;
+
+    src1_row.ne[1] = 1;
+    dst_row.ne[1] = 1;
+
+    src1_row.nb[2] = src1_row.nb[1];
+    dst_row.nb[2] = dst_row.nb[1];
+
+    src1_row.nb[3] = src1_row.nb[1];
+    dst_row.nb[3] = dst_row.nb[1];
+
+    src1_row.extra = &src1_row_extra;
+    dst_row.extra = &dst_row_extra;
+
+
+    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+        //int32_t row_id;
+        //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+        //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+        const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+        GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+
+        src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
+        src1_row.data = (char *) src1->data + i01*src1->nb[1];
 
-
-
+        dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
+        dst_row.data = (char *) dst->data + i01*dst->nb[1];
+
+        ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+    }
 }
 
 static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -8373,6 +8898,12 @@ static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, gg
     (void) dst;
 }
 
+static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
+}
+
 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     const int64_t nrows = ggml_nrows(tensor);
 
@@ -8422,8 +8953,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 
     // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
     if (ne0 % MATRIX_ROW_PADDING != 0) {
-        size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
-            * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
     }
 
     char * buf;
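The padding hunk above (and the matching one in the backend buffer-size code later in this diff) replaces the inline `pad * type_size / blck_size` expression with ggml_row_size, which sizes a row as whole quantization blocks. A quick numerical check of the equivalence, using Q4_0-like block parameters purely as example values (32 elements per 18-byte block) and a stand-in `row_size` helper; none of these names or numbers come from the diff itself:

#include <cstdio>
#include <cstddef>

// Stand-in for ggml_row_size(type, n): rows are stored as whole quant blocks.
static size_t row_size(size_t type_size, size_t blck_size, size_t n) {
    return n / blck_size * type_size;
}

int main() {
    // Q4_0-like layout used only as an example: 32 values per block, 18 bytes per block.
    const size_t type_size = 18, blck_size = 32;
    const size_t ne0 = 4000, MATRIX_ROW_PADDING = 512;

    // 4000 % 512 = 416, so 96 padding elements -- a whole number of 32-element blocks,
    // which is why the two integer-division orders give the same byte count.
    const size_t pad = MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING;

    const size_t old_way = pad * type_size / blck_size;         // expression removed in this diff
    const size_t new_way = row_size(type_size, blck_size, pad); // what ggml_row_size computes

    printf("%zu %zu\n", old_way, new_way); // both 54
    return 0;
}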
@@ -8683,6 +9213,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_ADD:
             func = ggml_cuda_add;
             break;
+        case GGML_OP_ACC:
+            func = ggml_cuda_acc;
+            break;
         case GGML_OP_MUL:
             func = ggml_cuda_mul;
             break;
@@ -8697,6 +9230,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             case GGML_UNARY_OP_SILU:
                 func = ggml_cuda_silu;
                 break;
+            case GGML_UNARY_OP_GELU_QUICK:
+                func = ggml_cuda_gelu_quick;
+                break;
+            case GGML_UNARY_OP_TANH:
+                func = ggml_cuda_tanh;
+                break;
             case GGML_UNARY_OP_RELU:
                 func = ggml_cuda_relu;
                 break;
@@ -8707,6 +9246,21 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_NORM:
             func = ggml_cuda_norm;
             break;
+        case GGML_OP_GROUP_NORM:
+            func = ggml_cuda_group_norm;
+            break;
+        case GGML_OP_CONCAT:
+            func = ggml_cuda_concat;
+            break;
+        case GGML_OP_UPSCALE:
+            func = ggml_cuda_upscale;
+            break;
+        case GGML_OP_PAD:
+            func = ggml_cuda_pad;
+            break;
+        case GGML_OP_LEAKY_RELU:
+            func = ggml_cuda_leaky_relu;
+            break;
         case GGML_OP_RMS_NORM:
             func = ggml_cuda_rms_norm;
             break;
@@ -8729,9 +9283,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             func = ggml_cuda_sqr;
             break;
         case GGML_OP_CLAMP:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_clamp;
             break;
         case GGML_OP_CPY:
@@ -8740,6 +9291,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_CONT:
             func = ggml_cuda_dup;
             break;
+        case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
@@ -8938,8 +9490,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
 
     if (ggml_is_quantized(tensor->type)) {
         if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
-                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
     }
 
@@ -9159,6 +9710,8 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_GELU_QUICK:
+                case GGML_UNARY_OP_TANH:
                     return true;
                 default:
                     return false;
@@ -9181,6 +9734,45 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
             }
             return true;
         } break;
+        case GGML_OP_GET_ROWS:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                        return true;
+                    default:
+                        return false;
+                }
+            } break;
+        case GGML_OP_CPY:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                ggml_type src1_type = op->src[1]->type;
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                return false;
+            } break;
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -9188,7 +9780,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
         case GGML_OP_TRANSPOSE:
         case GGML_OP_NORM:
         case GGML_OP_REPEAT:
-        case GGML_OP_GET_ROWS:
         case GGML_OP_DUP:
         case GGML_OP_ADD:
         case GGML_OP_MUL:
@@ -9197,7 +9788,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
         case GGML_OP_CLAMP:
-        case GGML_OP_CPY:
         case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
@@ -9206,6 +9796,12 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
         case GGML_OP_IM2COL:
         case GGML_OP_SUM_ROWS:
        case GGML_OP_ARGSORT:
+        case GGML_OP_ACC:
+        case GGML_OP_CONCAT:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_UPSCALE:
+        case GGML_OP_PAD:
+        case GGML_OP_LEAKY_RELU:
            return true;
        default:
            return false;
@@ -9264,7 +9860,9 @@ static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * use
     UNUSED(params);
 }
 
-extern "C" int ggml_backend_cuda_reg_devices()
+extern "C" int ggml_backend_cuda_reg_devices();
+
+int ggml_backend_cuda_reg_devices() {
     int device_count = ggml_cuda_get_device_count();
     //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
     for (int i = 0; i < device_count; i++) {