llama_cpp 0.10.0 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/llama_cpp.cpp +18 -1
- data/ext/llama_cpp/src/ggml-alloc.c +12 -4
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-backend-impl.h +12 -8
- data/ext/llama_cpp/src/ggml-backend.c +75 -5
- data/ext/llama_cpp/src/ggml-backend.h +7 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +952 -232
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +725 -98
- data/ext/llama_cpp/src/ggml-metal.metal +1508 -171
- data/ext/llama_cpp/src/ggml-quants.c +2 -2
- data/ext/llama_cpp/src/ggml.c +554 -215
- data/ext/llama_cpp/src/ggml.h +58 -23
- data/ext/llama_cpp/src/llama.cpp +1157 -851
- data/ext/llama_cpp/src/llama.h +9 -4
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +2 -2
@@ -1,13 +1,15 @@
|
|
1
1
|
#include <algorithm>
|
2
|
+
#include <assert.h>
|
3
|
+
#include <atomic>
|
4
|
+
#include <cinttypes>
|
2
5
|
#include <cstddef>
|
3
6
|
#include <cstdint>
|
4
|
-
#include <cinttypes>
|
5
7
|
#include <float.h>
|
6
8
|
#include <limits>
|
7
9
|
#include <stdint.h>
|
8
10
|
#include <stdio.h>
|
9
|
-
#include <
|
10
|
-
|
11
|
+
#include <vector>
|
12
|
+
|
11
13
|
|
12
14
|
#if defined(GGML_USE_HIPBLAS)
|
13
15
|
#include <hip/hip_runtime.h>
|
@@ -29,6 +31,7 @@
|
|
29
31
|
#define CUDA_R_16F HIPBLAS_R_16F
|
30
32
|
#define CUDA_R_32F HIPBLAS_R_32F
|
31
33
|
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
34
|
+
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
|
32
35
|
#define cublasCreate hipblasCreate
|
33
36
|
#define cublasGemmEx hipblasGemmEx
|
34
37
|
#define cublasGemmBatchedEx hipblasGemmBatchedEx
|
@@ -38,6 +41,7 @@
|
|
38
41
|
#define cublasSetStream hipblasSetStream
|
39
42
|
#define cublasSgemm hipblasSgemm
|
40
43
|
#define cublasStatus_t hipblasStatus_t
|
44
|
+
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
|
41
45
|
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
|
42
46
|
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
|
43
47
|
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
|
@@ -56,8 +60,13 @@
|
|
56
60
|
#define cudaGetDeviceProperties hipGetDeviceProperties
|
57
61
|
#define cudaGetErrorString hipGetErrorString
|
58
62
|
#define cudaGetLastError hipGetLastError
|
63
|
+
#ifdef GGML_HIP_UMA
|
64
|
+
#define cudaMalloc hipMallocManaged
|
65
|
+
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
|
66
|
+
#else
|
59
67
|
#define cudaMalloc hipMalloc
|
60
68
|
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
69
|
+
#endif
|
61
70
|
#define cudaMemcpy hipMemcpy
|
62
71
|
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
63
72
|
#define cudaMemcpyAsync hipMemcpyAsync
|
@@ -76,6 +85,7 @@
|
|
76
85
|
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
|
77
86
|
#define cudaStream_t hipStream_t
|
78
87
|
#define cudaSuccess hipSuccess
|
88
|
+
#define __trap abort
|
79
89
|
#else
|
80
90
|
#include <cuda_runtime.h>
|
81
91
|
#include <cublas_v2.h>
|
@@ -437,6 +447,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
|
|
437
447
|
|
438
448
|
#define CUDA_GELU_BLOCK_SIZE 256
|
439
449
|
#define CUDA_SILU_BLOCK_SIZE 256
|
450
|
+
#define CUDA_TANH_BLOCK_SIZE 256
|
440
451
|
#define CUDA_RELU_BLOCK_SIZE 256
|
441
452
|
#define CUDA_SQR_BLOCK_SIZE 256
|
442
453
|
#define CUDA_CPY_BLOCK_SIZE 32
|
@@ -449,6 +460,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
|
|
449
460
|
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
450
461
|
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
451
462
|
#define CUDA_GET_ROWS_BLOCK_SIZE 256
|
463
|
+
#define CUDA_UPSCALE_BLOCK_SIZE 256
|
464
|
+
#define CUDA_CONCAT_BLOCK_SIZE 256
|
465
|
+
#define CUDA_PAD_BLOCK_SIZE 256
|
466
|
+
#define CUDA_ACC_BLOCK_SIZE 256
|
467
|
+
#define CUDA_IM2COL_BLOCK_SIZE 256
|
452
468
|
|
453
469
|
// dmmv = dequantize_mul_mat_vec
|
454
470
|
#ifndef GGML_CUDA_DMMV_X
|
@@ -502,6 +518,14 @@ static size_t g_scratch_offset = 0;
|
|
502
518
|
|
503
519
|
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
504
520
|
|
521
|
+
[[noreturn]]
|
522
|
+
static __device__ void bad_arch() {
|
523
|
+
printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
|
524
|
+
__trap();
|
525
|
+
|
526
|
+
(void) bad_arch; // suppress unused function warning
|
527
|
+
}
|
528
|
+
|
505
529
|
static __device__ __forceinline__ float warp_reduce_sum(float x) {
|
506
530
|
#pragma unroll
|
507
531
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
@@ -610,6 +634,24 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
|
|
610
634
|
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
611
635
|
}
|
612
636
|
|
637
|
+
static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
|
638
|
+
const int ne10, const int ne11, const int ne12,
|
639
|
+
const int nb1, const int nb2, int offset) {
|
640
|
+
const int i = blockDim.x * blockIdx.x + threadIdx.x;
|
641
|
+
if (i >= ne) {
|
642
|
+
return;
|
643
|
+
}
|
644
|
+
int src1_idx = i - offset;
|
645
|
+
int oz = src1_idx / nb2;
|
646
|
+
int oy = (src1_idx - (oz * nb2)) / nb1;
|
647
|
+
int ox = src1_idx % nb1;
|
648
|
+
if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
|
649
|
+
dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
|
650
|
+
} else {
|
651
|
+
dst[i] = x[i];
|
652
|
+
}
|
653
|
+
}
|
654
|
+
|
613
655
|
static __global__ void gelu_f32(const float * x, float * dst, const int k) {
|
614
656
|
const float GELU_COEF_A = 0.044715f;
|
615
657
|
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
@@ -632,6 +674,23 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
|
|
632
674
|
dst[i] = x[i] / (1.0f + expf(-x[i]));
|
633
675
|
}
|
634
676
|
|
677
|
+
static __global__ void gelu_quick_f32(const float *x, float *dst, int k) {
|
678
|
+
const float GELU_QUICK_COEF = -1.702f;
|
679
|
+
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
680
|
+
if (i >= k) {
|
681
|
+
return;
|
682
|
+
}
|
683
|
+
dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
|
684
|
+
}
|
685
|
+
|
686
|
+
static __global__ void tanh_f32(const float *x, float *dst, int k) {
|
687
|
+
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
688
|
+
if (i >= k) {
|
689
|
+
return;
|
690
|
+
}
|
691
|
+
dst[i] = tanhf(x[i]);
|
692
|
+
}
|
693
|
+
|
635
694
|
static __global__ void relu_f32(const float * x, float * dst, const int k) {
|
636
695
|
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
637
696
|
|
@@ -641,6 +700,14 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
|
|
641
700
|
dst[i] = fmaxf(x[i], 0);
|
642
701
|
}
|
643
702
|
|
703
|
+
static __global__ void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope) {
|
704
|
+
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
705
|
+
if (i >= k) {
|
706
|
+
return;
|
707
|
+
}
|
708
|
+
dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
|
709
|
+
}
|
710
|
+
|
644
711
|
static __global__ void sqr_f32(const float * x, float * dst, const int k) {
|
645
712
|
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
646
713
|
|
@@ -686,6 +753,132 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c
|
|
686
753
|
}
|
687
754
|
}
|
688
755
|
|
756
|
+
static __global__ void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02) {
|
757
|
+
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
758
|
+
if (nidx >= ne0) {
|
759
|
+
return;
|
760
|
+
}
|
761
|
+
// operation
|
762
|
+
int offset_dst =
|
763
|
+
nidx +
|
764
|
+
blockIdx.y * ne0 +
|
765
|
+
blockIdx.z * ne0 * gridDim.y;
|
766
|
+
if (blockIdx.z < ne02) { // src0
|
767
|
+
int offset_src =
|
768
|
+
nidx +
|
769
|
+
blockIdx.y * ne0 +
|
770
|
+
blockIdx.z * ne0 * gridDim.y;
|
771
|
+
dst[offset_dst] = x[offset_src];
|
772
|
+
} else {
|
773
|
+
int offset_src =
|
774
|
+
nidx +
|
775
|
+
blockIdx.y * ne0 +
|
776
|
+
(blockIdx.z - ne02) * ne0 * gridDim.y;
|
777
|
+
dst[offset_dst] = y[offset_src];
|
778
|
+
}
|
779
|
+
}
|
780
|
+
|
781
|
+
static __global__ void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor) {
|
782
|
+
int ne0 = ne00 * scale_factor;
|
783
|
+
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
784
|
+
if (nidx >= ne0) {
|
785
|
+
return;
|
786
|
+
}
|
787
|
+
// operation
|
788
|
+
int i00 = nidx / scale_factor;
|
789
|
+
int i01 = blockIdx.y / scale_factor;
|
790
|
+
int offset_src =
|
791
|
+
i00 +
|
792
|
+
i01 * ne00 +
|
793
|
+
blockIdx.z * nb02;
|
794
|
+
int offset_dst =
|
795
|
+
nidx +
|
796
|
+
blockIdx.y * ne0 +
|
797
|
+
blockIdx.z * ne0 * gridDim.y;
|
798
|
+
dst[offset_dst] = x[offset_src];
|
799
|
+
}
|
800
|
+
|
801
|
+
static __global__ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02) {
|
802
|
+
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
803
|
+
if (nidx >= ne0) {
|
804
|
+
return;
|
805
|
+
}
|
806
|
+
|
807
|
+
// operation
|
808
|
+
int offset_dst =
|
809
|
+
nidx +
|
810
|
+
blockIdx.y * ne0 +
|
811
|
+
blockIdx.z * ne0 * gridDim.y;
|
812
|
+
if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02) {
|
813
|
+
int offset_src =
|
814
|
+
nidx +
|
815
|
+
blockIdx.y * ne00 +
|
816
|
+
blockIdx.z * ne00 * ne01;
|
817
|
+
dst[offset_dst] = x[offset_src];
|
818
|
+
} else {
|
819
|
+
dst[offset_dst] = 0.0f;
|
820
|
+
}
|
821
|
+
}
|
822
|
+
|
823
|
+
template <int block_size>
|
824
|
+
static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
|
825
|
+
int start = blockIdx.x * group_size;
|
826
|
+
int end = start + group_size;
|
827
|
+
|
828
|
+
start += threadIdx.x;
|
829
|
+
|
830
|
+
if (end >= ne_elements) {
|
831
|
+
end = ne_elements;
|
832
|
+
}
|
833
|
+
|
834
|
+
float tmp = 0.0f; // partial sum for thread in warp
|
835
|
+
|
836
|
+
for (int j = start; j < end; j += block_size) {
|
837
|
+
tmp += x[j];
|
838
|
+
}
|
839
|
+
|
840
|
+
tmp = warp_reduce_sum(tmp);
|
841
|
+
if (block_size > WARP_SIZE) {
|
842
|
+
__shared__ float s_sum[32];
|
843
|
+
int warp_id = threadIdx.x / WARP_SIZE;
|
844
|
+
int lane_id = threadIdx.x % WARP_SIZE;
|
845
|
+
if (lane_id == 0) {
|
846
|
+
s_sum[warp_id] = tmp;
|
847
|
+
}
|
848
|
+
__syncthreads();
|
849
|
+
tmp = s_sum[lane_id];
|
850
|
+
tmp = warp_reduce_sum(tmp);
|
851
|
+
}
|
852
|
+
|
853
|
+
float mean = tmp / group_size;
|
854
|
+
tmp = 0.0f;
|
855
|
+
|
856
|
+
for (int j = start; j < end; j += block_size) {
|
857
|
+
float xi = x[j] - mean;
|
858
|
+
dst[j] = xi;
|
859
|
+
tmp += xi * xi;
|
860
|
+
}
|
861
|
+
|
862
|
+
tmp = warp_reduce_sum(tmp);
|
863
|
+
if (block_size > WARP_SIZE) {
|
864
|
+
__shared__ float s_sum[32];
|
865
|
+
int warp_id = threadIdx.x / WARP_SIZE;
|
866
|
+
int lane_id = threadIdx.x % WARP_SIZE;
|
867
|
+
if (lane_id == 0) {
|
868
|
+
s_sum[warp_id] = tmp;
|
869
|
+
}
|
870
|
+
__syncthreads();
|
871
|
+
tmp = s_sum[lane_id];
|
872
|
+
tmp = warp_reduce_sum(tmp);
|
873
|
+
}
|
874
|
+
|
875
|
+
float variance = tmp / group_size;
|
876
|
+
float scale = rsqrtf(variance + eps);
|
877
|
+
for (int j = start; j < end; j += block_size) {
|
878
|
+
dst[j] *= scale;
|
879
|
+
}
|
880
|
+
}
|
881
|
+
|
689
882
|
template <int block_size>
|
690
883
|
static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
|
691
884
|
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
@@ -1684,31 +1877,65 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
|
|
1684
1877
|
}
|
1685
1878
|
|
1686
1879
|
template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
1687
|
-
static __global__ void k_get_rows(
|
1688
|
-
|
1689
|
-
|
1690
|
-
|
1691
|
-
|
1880
|
+
static __global__ void k_get_rows(
|
1881
|
+
const void * src0, const int32_t * src1, dst_t * dst,
|
1882
|
+
int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
|
1883
|
+
/*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
|
1884
|
+
/*size_t s0,*/ size_t s1, size_t s2, size_t s3,
|
1885
|
+
/*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
|
1886
|
+
size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
|
1887
|
+
|
1888
|
+
const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
|
1889
|
+
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
|
1890
|
+
const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
|
1891
|
+
const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
|
1892
|
+
|
1893
|
+
if (i00 >= ne00) {
|
1692
1894
|
return;
|
1693
1895
|
}
|
1694
1896
|
|
1695
|
-
const int
|
1897
|
+
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
|
1696
1898
|
|
1697
|
-
|
1698
|
-
const
|
1699
|
-
const int di = row*ncols + col;
|
1899
|
+
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
|
1900
|
+
const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
|
1700
1901
|
|
1701
|
-
const int ib =
|
1702
|
-
const int iqs = (
|
1703
|
-
const int iybs =
|
1902
|
+
const int ib = i00/qk; // block index
|
1903
|
+
const int iqs = (i00%qk)/qr; // quant index
|
1904
|
+
const int iybs = i00 - i00%qk; // dst block start index
|
1704
1905
|
const int y_offset = qr == 1 ? 1 : qk/2;
|
1705
1906
|
|
1706
1907
|
// dequantize
|
1707
1908
|
dfloat2 v;
|
1708
|
-
dequantize_kernel(
|
1909
|
+
dequantize_kernel(src0_row, ib, iqs, v);
|
1709
1910
|
|
1710
|
-
|
1711
|
-
|
1911
|
+
dst_row[iybs + iqs + 0] = v.x;
|
1912
|
+
dst_row[iybs + iqs + y_offset] = v.y;
|
1913
|
+
}
|
1914
|
+
|
1915
|
+
template<typename src0_t, typename dst_t>
|
1916
|
+
static __global__ void k_get_rows_float(
|
1917
|
+
const src0_t * src0, const int32_t * src1, dst_t * dst,
|
1918
|
+
int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
|
1919
|
+
/*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
|
1920
|
+
/*size_t s0,*/ size_t s1, size_t s2, size_t s3,
|
1921
|
+
/*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
|
1922
|
+
size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
|
1923
|
+
|
1924
|
+
const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
|
1925
|
+
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
|
1926
|
+
const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
|
1927
|
+
const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
|
1928
|
+
|
1929
|
+
if (i00 >= ne00) {
|
1930
|
+
return;
|
1931
|
+
}
|
1932
|
+
|
1933
|
+
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
|
1934
|
+
|
1935
|
+
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
|
1936
|
+
const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
|
1937
|
+
|
1938
|
+
dst_row[i00] = src0_row[i00];
|
1712
1939
|
}
|
1713
1940
|
|
1714
1941
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
@@ -1759,8 +1986,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
|
|
1759
1986
|
// second part effectively subtracts 8 from each quant value
|
1760
1987
|
return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
|
1761
1988
|
#else
|
1762
|
-
|
1763
|
-
return 0.0f; // only to satisfy the compiler
|
1989
|
+
bad_arch();
|
1764
1990
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1765
1991
|
}
|
1766
1992
|
|
@@ -1797,8 +2023,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
|
|
1797
2023
|
// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
|
1798
2024
|
return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
|
1799
2025
|
#else
|
1800
|
-
|
1801
|
-
return 0.0f; // only to satisfy the compiler
|
2026
|
+
bad_arch();
|
1802
2027
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1803
2028
|
}
|
1804
2029
|
|
@@ -1833,8 +2058,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
|
|
1833
2058
|
// second part effectively subtracts 16 from each quant value
|
1834
2059
|
return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
|
1835
2060
|
#else
|
1836
|
-
|
1837
|
-
return 0.0f; // only to satisfy the compiler
|
2061
|
+
bad_arch();
|
1838
2062
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1839
2063
|
}
|
1840
2064
|
|
@@ -1879,8 +2103,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
|
|
1879
2103
|
return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
|
1880
2104
|
|
1881
2105
|
#else
|
1882
|
-
|
1883
|
-
return 0.0f; // only to satisfy the compiler
|
2106
|
+
bad_arch();
|
1884
2107
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1885
2108
|
}
|
1886
2109
|
|
@@ -1901,8 +2124,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
|
|
1901
2124
|
|
1902
2125
|
return d8_0*d8_1 * sumi;
|
1903
2126
|
#else
|
1904
|
-
|
1905
|
-
return 0.0f; // only to satisfy the compiler
|
2127
|
+
bad_arch();
|
1906
2128
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1907
2129
|
}
|
1908
2130
|
|
@@ -1932,8 +2154,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
|
|
1932
2154
|
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
|
1933
2155
|
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
|
1934
2156
|
#else
|
1935
|
-
|
1936
|
-
return 0.0f; // only to satisfy the compiler
|
2157
|
+
bad_arch();
|
1937
2158
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1938
2159
|
}
|
1939
2160
|
|
@@ -1968,8 +2189,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
|
|
1968
2189
|
|
1969
2190
|
return dm2f.x*sumf_d - dm2f.y*sumf_m;
|
1970
2191
|
#else
|
1971
|
-
|
1972
|
-
return 0.0f; // only to satisfy the compiler
|
2192
|
+
bad_arch();
|
1973
2193
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1974
2194
|
}
|
1975
2195
|
|
@@ -2006,8 +2226,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
|
|
2006
2226
|
|
2007
2227
|
return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
|
2008
2228
|
#else
|
2009
|
-
|
2010
|
-
return 0.0f; // only to satisfy the compiler
|
2229
|
+
bad_arch();
|
2011
2230
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2012
2231
|
}
|
2013
2232
|
|
@@ -2047,8 +2266,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
|
|
2047
2266
|
|
2048
2267
|
return d3 * sumf;
|
2049
2268
|
#else
|
2050
|
-
|
2051
|
-
return 0.0f; // only to satisfy the compiler
|
2269
|
+
bad_arch();
|
2052
2270
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2053
2271
|
}
|
2054
2272
|
|
@@ -2073,8 +2291,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
|
|
2073
2291
|
|
2074
2292
|
return d3*d8 * sumi;
|
2075
2293
|
#else
|
2076
|
-
|
2077
|
-
return 0.0f; // only to satisfy the compiler
|
2294
|
+
bad_arch();
|
2078
2295
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2079
2296
|
}
|
2080
2297
|
|
@@ -2107,8 +2324,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
|
|
2107
2324
|
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
2108
2325
|
|
2109
2326
|
#else
|
2110
|
-
|
2111
|
-
return 0.0f; // only to satisfy the compiler
|
2327
|
+
bad_arch();
|
2112
2328
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2113
2329
|
}
|
2114
2330
|
|
@@ -2141,8 +2357,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
|
|
2141
2357
|
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
2142
2358
|
|
2143
2359
|
#else
|
2144
|
-
|
2145
|
-
return 0.0f; // only to satisfy the compiler
|
2360
|
+
bad_arch();
|
2146
2361
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2147
2362
|
}
|
2148
2363
|
|
@@ -2182,8 +2397,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
|
|
2182
2397
|
return dm5f.x*sumf_d - dm5f.y*sumf_m;
|
2183
2398
|
|
2184
2399
|
#else
|
2185
|
-
|
2186
|
-
return 0.0f; // only to satisfy the compiler
|
2400
|
+
bad_arch();
|
2187
2401
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2188
2402
|
}
|
2189
2403
|
|
@@ -2216,8 +2430,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
|
|
2216
2430
|
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
2217
2431
|
|
2218
2432
|
#else
|
2219
|
-
|
2220
|
-
return 0.0f; // only to satisfy the compiler
|
2433
|
+
bad_arch();
|
2221
2434
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2222
2435
|
}
|
2223
2436
|
|
@@ -2247,8 +2460,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
|
|
2247
2460
|
|
2248
2461
|
return d*sumf;
|
2249
2462
|
#else
|
2250
|
-
|
2251
|
-
return 0.0f; // only to satisfy the compiler
|
2463
|
+
bad_arch();
|
2252
2464
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2253
2465
|
}
|
2254
2466
|
|
@@ -2279,8 +2491,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
|
|
2279
2491
|
return d6 * sumf_d;
|
2280
2492
|
|
2281
2493
|
#else
|
2282
|
-
|
2283
|
-
return 0.0f; // only to satisfy the compiler
|
2494
|
+
bad_arch();
|
2284
2495
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2285
2496
|
}
|
2286
2497
|
|
@@ -3146,8 +3357,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
|
3146
3357
|
return dall * sumf_d - dmin * sumf_m;
|
3147
3358
|
|
3148
3359
|
#else
|
3149
|
-
|
3150
|
-
return 0.0f; // only to satisfy the compiler
|
3360
|
+
bad_arch();
|
3151
3361
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
3152
3362
|
|
3153
3363
|
#endif
|
@@ -3330,8 +3540,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|
3330
3540
|
return d * sumf_d;
|
3331
3541
|
|
3332
3542
|
#else
|
3333
|
-
|
3334
|
-
return 0.0f; // only to satisfy the compiler
|
3543
|
+
bad_arch();
|
3335
3544
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
3336
3545
|
|
3337
3546
|
#endif
|
@@ -3741,7 +3950,7 @@ template <bool need_check> static __global__ void
|
|
3741
3950
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3742
3951
|
#else
|
3743
3952
|
(void) vec_dot_q4_0_q8_1_mul_mat;
|
3744
|
-
|
3953
|
+
bad_arch();
|
3745
3954
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
3746
3955
|
}
|
3747
3956
|
|
@@ -3810,7 +4019,7 @@ template <bool need_check> static __global__ void
|
|
3810
4019
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3811
4020
|
#else
|
3812
4021
|
(void) vec_dot_q4_1_q8_1_mul_mat;
|
3813
|
-
|
4022
|
+
bad_arch();
|
3814
4023
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
3815
4024
|
}
|
3816
4025
|
|
@@ -3877,7 +4086,7 @@ template <bool need_check> static __global__ void
|
|
3877
4086
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3878
4087
|
#else
|
3879
4088
|
(void) vec_dot_q5_0_q8_1_mul_mat;
|
3880
|
-
|
4089
|
+
bad_arch();
|
3881
4090
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
3882
4091
|
}
|
3883
4092
|
|
@@ -3944,7 +4153,7 @@ mul_mat_q5_1(
|
|
3944
4153
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3945
4154
|
#else
|
3946
4155
|
(void) vec_dot_q5_1_q8_1_mul_mat;
|
3947
|
-
|
4156
|
+
bad_arch();
|
3948
4157
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
3949
4158
|
}
|
3950
4159
|
|
@@ -4011,7 +4220,7 @@ template <bool need_check> static __global__ void
|
|
4011
4220
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4012
4221
|
#else
|
4013
4222
|
(void) vec_dot_q8_0_q8_1_mul_mat;
|
4014
|
-
|
4223
|
+
bad_arch();
|
4015
4224
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
4016
4225
|
}
|
4017
4226
|
|
@@ -4078,7 +4287,7 @@ mul_mat_q2_K(
|
|
4078
4287
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4079
4288
|
#else
|
4080
4289
|
(void) vec_dot_q2_K_q8_1_mul_mat;
|
4081
|
-
|
4290
|
+
bad_arch();
|
4082
4291
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
4083
4292
|
}
|
4084
4293
|
|
@@ -4147,7 +4356,7 @@ template <bool need_check> static __global__ void
|
|
4147
4356
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4148
4357
|
#else
|
4149
4358
|
(void) vec_dot_q3_K_q8_1_mul_mat;
|
4150
|
-
|
4359
|
+
bad_arch();
|
4151
4360
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
4152
4361
|
}
|
4153
4362
|
|
@@ -4216,7 +4425,7 @@ template <bool need_check> static __global__ void
|
|
4216
4425
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4217
4426
|
#else
|
4218
4427
|
(void) vec_dot_q4_K_q8_1_mul_mat;
|
4219
|
-
|
4428
|
+
bad_arch();
|
4220
4429
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
4221
4430
|
}
|
4222
4431
|
|
@@ -4283,7 +4492,7 @@ mul_mat_q5_K(
|
|
4283
4492
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4284
4493
|
#else
|
4285
4494
|
(void) vec_dot_q5_K_q8_1_mul_mat;
|
4286
|
-
|
4495
|
+
bad_arch();
|
4287
4496
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
4288
4497
|
}
|
4289
4498
|
|
@@ -4352,7 +4561,7 @@ template <bool need_check> static __global__ void
|
|
4352
4561
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4353
4562
|
#else
|
4354
4563
|
(void) vec_dot_q6_K_q8_1_mul_mat;
|
4355
|
-
|
4564
|
+
bad_arch();
|
4356
4565
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
4357
4566
|
}
|
4358
4567
|
|
@@ -4787,7 +4996,16 @@ static __global__ void rope_neox(
|
|
4787
4996
|
const int ib = col / n_dims;
|
4788
4997
|
const int ic = col % n_dims;
|
4789
4998
|
|
4790
|
-
|
4999
|
+
if (ib > 0) {
|
5000
|
+
const int i = row*ncols + ib*n_dims + ic;
|
5001
|
+
|
5002
|
+
dst[i + 0] = x[i + 0];
|
5003
|
+
dst[i + 1] = x[i + 1];
|
5004
|
+
|
5005
|
+
return;
|
5006
|
+
}
|
5007
|
+
|
5008
|
+
const int i = row*ncols + ib*n_dims + ic/2;
|
4791
5009
|
const int i2 = row/p_delta_rows;
|
4792
5010
|
|
4793
5011
|
float cur_rot = inv_ndims * ic - ib;
|
@@ -5035,29 +5253,98 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
|
|
5035
5253
|
|
5036
5254
|
static __global__ void im2col_f32_f16(
|
5037
5255
|
const float * x, half * dst,
|
5038
|
-
int
|
5256
|
+
int offset_delta, int IW, int IH, int OW, int KW, int KH, int pelements, int CHW,
|
5039
5257
|
int s0, int s1, int p0, int p1, int d0, int d1) {
|
5040
|
-
const int
|
5041
|
-
|
5258
|
+
const int i = threadIdx.x + blockIdx.x * blockDim.x;
|
5259
|
+
if (i >= pelements) {
|
5260
|
+
return;
|
5261
|
+
}
|
5262
|
+
|
5263
|
+
const int ksize = OW * (KH > 1 ? KW : 1);
|
5264
|
+
const int kx = i / ksize;
|
5265
|
+
const int kd = kx * ksize;
|
5266
|
+
const int ky = (i - kd) / OW;
|
5267
|
+
const int ix = i % OW;
|
5268
|
+
|
5269
|
+
const int iiw = ix * s0 + kx * d0 - p0;
|
5270
|
+
const int iih = blockIdx.y * s1 + ky * d1 - p1;
|
5042
5271
|
|
5043
5272
|
const int offset_dst =
|
5044
|
-
(
|
5045
|
-
(blockIdx.
|
5273
|
+
(blockIdx.y * OW + ix) * CHW +
|
5274
|
+
(blockIdx.z * (KW * KH) + ky * KW + kx);
|
5046
5275
|
|
5047
5276
|
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
5048
5277
|
dst[offset_dst] = __float2half(0.0f);
|
5049
5278
|
} else {
|
5050
|
-
const int offset_src =
|
5279
|
+
const int offset_src = blockIdx.z * offset_delta;
|
5051
5280
|
dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
|
5052
5281
|
}
|
5053
5282
|
}
|
5054
5283
|
|
5055
5284
|
template<int qk, int qr, dequantize_kernel_t dq>
|
5056
|
-
static void get_rows_cuda(const
|
5285
|
+
static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5286
|
+
const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
|
5287
|
+
|
5288
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
5289
|
+
|
5057
5290
|
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
|
5058
|
-
const int block_num_x = (
|
5059
|
-
const dim3 block_nums(block_num_x,
|
5060
|
-
|
5291
|
+
const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
|
5292
|
+
const dim3 block_nums(block_num_x, ne10, ne11*ne12);
|
5293
|
+
|
5294
|
+
// strides in elements
|
5295
|
+
//const size_t s0 = nb0 / ggml_element_size(dst);
|
5296
|
+
const size_t s1 = nb1 / ggml_element_size(dst);
|
5297
|
+
const size_t s2 = nb2 / ggml_element_size(dst);
|
5298
|
+
const size_t s3 = nb3 / ggml_element_size(dst);
|
5299
|
+
|
5300
|
+
const size_t s10 = nb10 / ggml_element_size(src1);
|
5301
|
+
const size_t s11 = nb11 / ggml_element_size(src1);
|
5302
|
+
const size_t s12 = nb12 / ggml_element_size(src1);
|
5303
|
+
//const size_t s13 = nb13 / ggml_element_size(src1);
|
5304
|
+
|
5305
|
+
GGML_ASSERT(ne00 % 2 == 0);
|
5306
|
+
|
5307
|
+
k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
|
5308
|
+
src0_dd, src1_dd, dst_dd,
|
5309
|
+
ne00, /*ne01, ne02, ne03,*/
|
5310
|
+
/*ne10, ne11,*/ ne12, /*ne13,*/
|
5311
|
+
/* s0,*/ s1, s2, s3,
|
5312
|
+
/* nb00,*/ nb01, nb02, nb03,
|
5313
|
+
s10, s11, s12/*, s13*/);
|
5314
|
+
|
5315
|
+
(void) dst;
|
5316
|
+
}
|
5317
|
+
|
5318
|
+
template<typename src0_t>
|
5319
|
+
static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5320
|
+
const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
|
5321
|
+
|
5322
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
5323
|
+
|
5324
|
+
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
|
5325
|
+
const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
|
5326
|
+
const dim3 block_nums(block_num_x, ne10, ne11*ne12);
|
5327
|
+
|
5328
|
+
// strides in elements
|
5329
|
+
//const size_t s0 = nb0 / ggml_element_size(dst);
|
5330
|
+
const size_t s1 = nb1 / ggml_element_size(dst);
|
5331
|
+
const size_t s2 = nb2 / ggml_element_size(dst);
|
5332
|
+
const size_t s3 = nb3 / ggml_element_size(dst);
|
5333
|
+
|
5334
|
+
const size_t s10 = nb10 / ggml_element_size(src1);
|
5335
|
+
const size_t s11 = nb11 / ggml_element_size(src1);
|
5336
|
+
const size_t s12 = nb12 / ggml_element_size(src1);
|
5337
|
+
//const size_t s13 = nb13 / ggml_element_size(src1);
|
5338
|
+
|
5339
|
+
k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
|
5340
|
+
src0_dd, src1_dd, dst_dd,
|
5341
|
+
ne00, /*ne01, ne02, ne03,*/
|
5342
|
+
/*ne10, ne11,*/ ne12, /*ne13,*/
|
5343
|
+
/* s0,*/ s1, s2, s3,
|
5344
|
+
/* nb00,*/ nb01, nb02, nb03,
|
5345
|
+
s10, s11, s12/*, s13*/);
|
5346
|
+
|
5347
|
+
(void) dst;
|
5061
5348
|
}
|
5062
5349
|
|
5063
5350
|
template<float (*bin_op)(const float, const float)>
|
@@ -5069,7 +5356,6 @@ struct bin_bcast_cuda {
|
|
5069
5356
|
|
5070
5357
|
GGML_TENSOR_BINARY_OP_LOCALS
|
5071
5358
|
|
5072
|
-
|
5073
5359
|
int nr0 = ne10/ne0;
|
5074
5360
|
int nr1 = ne11/ne1;
|
5075
5361
|
int nr2 = ne12/ne2;
|
@@ -5117,26 +5403,28 @@ struct bin_bcast_cuda {
|
|
5117
5403
|
int64_t ne12 = cne1[2];
|
5118
5404
|
int64_t ne13 = cne1[3];
|
5119
5405
|
|
5120
|
-
|
5406
|
+
size_t nb0 = cnb0[0];
|
5121
5407
|
size_t nb1 = cnb0[1];
|
5122
5408
|
size_t nb2 = cnb0[2];
|
5123
5409
|
size_t nb3 = cnb0[3];
|
5124
5410
|
|
5125
|
-
|
5411
|
+
size_t nb10 = cnb1[0];
|
5126
5412
|
size_t nb11 = cnb1[1];
|
5127
5413
|
size_t nb12 = cnb1[2];
|
5128
5414
|
size_t nb13 = cnb1[3];
|
5129
5415
|
|
5130
|
-
|
5131
|
-
size_t s1 = nb1 / sizeof(
|
5132
|
-
size_t s2 = nb2 / sizeof(
|
5133
|
-
size_t s3 = nb3 / sizeof(
|
5416
|
+
size_t s0 = nb0 / sizeof(dst_t);
|
5417
|
+
size_t s1 = nb1 / sizeof(dst_t);
|
5418
|
+
size_t s2 = nb2 / sizeof(dst_t);
|
5419
|
+
size_t s3 = nb3 / sizeof(dst_t);
|
5134
5420
|
|
5135
|
-
|
5421
|
+
size_t s10 = nb10 / sizeof(src1_t);
|
5136
5422
|
size_t s11 = nb11 / sizeof(src1_t);
|
5137
5423
|
size_t s12 = nb12 / sizeof(src1_t);
|
5138
5424
|
size_t s13 = nb13 / sizeof(src1_t);
|
5139
5425
|
|
5426
|
+
GGML_ASSERT(s0 == 1);
|
5427
|
+
GGML_ASSERT(s10 == 1);
|
5140
5428
|
|
5141
5429
|
const int block_size = 128;
|
5142
5430
|
|
@@ -5174,6 +5462,13 @@ struct bin_bcast_cuda {
|
|
5174
5462
|
}
|
5175
5463
|
};
|
5176
5464
|
|
5465
|
+
static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
|
5466
|
+
const int ne10, const int ne11, const int ne12,
|
5467
|
+
const int nb1, const int nb2, const int offset, cudaStream_t stream) {
|
5468
|
+
int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
|
5469
|
+
acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
|
5470
|
+
}
|
5471
|
+
|
5177
5472
|
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
5178
5473
|
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
|
5179
5474
|
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
@@ -5184,11 +5479,26 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
|
|
5184
5479
|
silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
5185
5480
|
}
|
5186
5481
|
|
5482
|
+
static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
5483
|
+
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
|
5484
|
+
gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
5485
|
+
}
|
5486
|
+
|
5487
|
+
static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
5488
|
+
const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
|
5489
|
+
tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
5490
|
+
}
|
5491
|
+
|
5187
5492
|
static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
5188
5493
|
const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
|
5189
5494
|
relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
5190
5495
|
}
|
5191
5496
|
|
5497
|
+
static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
|
5498
|
+
const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
|
5499
|
+
leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
|
5500
|
+
}
|
5501
|
+
|
5192
5502
|
static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
5193
5503
|
const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
|
5194
5504
|
sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
@@ -5205,6 +5515,38 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
|
|
5205
5515
|
}
|
5206
5516
|
}
|
5207
5517
|
|
5518
|
+
static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
|
5519
|
+
static const float eps = 1e-6f;
|
5520
|
+
if (group_size < 1024) {
|
5521
|
+
const dim3 block_dims(WARP_SIZE, 1, 1);
|
5522
|
+
group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
|
5523
|
+
} else {
|
5524
|
+
const dim3 block_dims(1024, 1, 1);
|
5525
|
+
group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
|
5526
|
+
}
|
5527
|
+
}
|
5528
|
+
|
5529
|
+
static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
|
5530
|
+
int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
|
5531
|
+
dim3 gridDim(num_blocks, ne1, ne2);
|
5532
|
+
concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
|
5533
|
+
}
|
5534
|
+
|
5535
|
+
static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int scale_factor, cudaStream_t stream) {
|
5536
|
+
int ne0 = (ne00 * scale_factor);
|
5537
|
+
int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
|
5538
|
+
dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02);
|
5539
|
+
upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
|
5540
|
+
}
|
5541
|
+
|
5542
|
+
static void pad_f32_cuda(const float * x, float * dst,
|
5543
|
+
const int ne00, const int ne01, const int ne02,
|
5544
|
+
const int ne0, const int ne1, const int ne2, cudaStream_t stream) {
|
5545
|
+
int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
|
5546
|
+
dim3 gridDim(num_blocks, ne1, ne2);
|
5547
|
+
pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02);
|
5548
|
+
}
|
5549
|
+
|
5208
5550
|
static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
|
5209
5551
|
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
5210
5552
|
if (ncols < 1024) {
|
@@ -6167,13 +6509,14 @@ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, con
|
|
6167
6509
|
soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
6168
6510
|
}
|
6169
6511
|
|
6170
|
-
static void im2col_f32_f16_cuda(const float
|
6171
|
-
int
|
6172
|
-
int
|
6173
|
-
int s0,
|
6174
|
-
|
6175
|
-
|
6176
|
-
|
6512
|
+
static void im2col_f32_f16_cuda(const float* x, half* dst,
|
6513
|
+
int IW, int IH, int OW, int OH, int KW, int KH, int IC,
|
6514
|
+
int offset_delta,
|
6515
|
+
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
|
6516
|
+
const int parallel_elements = OW * KW * KH;
|
6517
|
+
const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
|
6518
|
+
dim3 block_nums(num_blocks, OH, IC);
|
6519
|
+
im2col_f32_f16<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, offset_delta, IW, IH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
|
6177
6520
|
}
|
6178
6521
|
|
6179
6522
|
// buffer pool for cuda
|
@@ -6447,39 +6790,38 @@ static void ggml_cuda_op_get_rows(
|
|
6447
6790
|
|
6448
6791
|
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
6449
6792
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
6450
|
-
GGML_ASSERT(ggml_is_contiguous(src0));
|
6451
|
-
GGML_ASSERT(ggml_is_contiguous(src1));
|
6452
|
-
GGML_ASSERT(ggml_is_contiguous(dst));
|
6453
6793
|
|
6454
|
-
|
6455
|
-
|
6794
|
+
GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
|
6795
|
+
GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
|
6796
|
+
GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
|
6456
6797
|
|
6457
6798
|
const int32_t * src1_i32 = (const int32_t *) src1_d;
|
6458
6799
|
|
6459
6800
|
switch (src0->type) {
|
6460
6801
|
case GGML_TYPE_F16:
|
6461
|
-
|
6802
|
+
get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
|
6462
6803
|
break;
|
6463
6804
|
case GGML_TYPE_F32:
|
6464
|
-
|
6805
|
+
get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
|
6465
6806
|
break;
|
6466
6807
|
case GGML_TYPE_Q4_0:
|
6467
|
-
get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(
|
6808
|
+
get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
|
6468
6809
|
break;
|
6469
6810
|
case GGML_TYPE_Q4_1:
|
6470
|
-
get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(
|
6811
|
+
get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
|
6471
6812
|
break;
|
6472
6813
|
case GGML_TYPE_Q5_0:
|
6473
|
-
get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(
|
6814
|
+
get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
|
6474
6815
|
break;
|
6475
6816
|
case GGML_TYPE_Q5_1:
|
6476
|
-
get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(
|
6817
|
+
get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
|
6477
6818
|
break;
|
6478
6819
|
case GGML_TYPE_Q8_0:
|
6479
|
-
get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(
|
6820
|
+
get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
|
6480
6821
|
break;
|
6481
6822
|
default:
|
6482
6823
|
// TODO: k-quants
|
6824
|
+
fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
|
6483
6825
|
GGML_ASSERT(false);
|
6484
6826
|
break;
|
6485
6827
|
}
|
@@ -6522,6 +6864,25 @@ inline void ggml_cuda_op_add(
|
|
6522
6864
|
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
6523
6865
|
}
|
6524
6866
|
|
6867
|
+
inline void ggml_cuda_op_acc(
|
6868
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6869
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6870
|
+
|
6871
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6872
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
6873
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
6874
|
+
GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
|
6875
|
+
|
6876
|
+
int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
|
6877
|
+
int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
|
6878
|
+
// int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
|
6879
|
+
int offset = dst->op_params[3] / 4; // offset in bytes
|
6880
|
+
|
6881
|
+
acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
|
6882
|
+
|
6883
|
+
(void) dst;
|
6884
|
+
}
|
6885
|
+
|
6525
6886
|
inline void ggml_cuda_op_mul(
|
6526
6887
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6527
6888
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
@@ -6564,6 +6925,34 @@ inline void ggml_cuda_op_silu(
|
|
6564
6925
|
(void) src1_dd;
|
6565
6926
|
}
|
6566
6927
|
|
6928
|
+
inline void ggml_cuda_op_gelu_quick(
|
6929
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6930
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6931
|
+
|
6932
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6933
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
6934
|
+
|
6935
|
+
gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
|
6936
|
+
|
6937
|
+
(void) src1;
|
6938
|
+
(void) dst;
|
6939
|
+
(void) src1_dd;
|
6940
|
+
}
|
6941
|
+
|
6942
|
+
inline void ggml_cuda_op_tanh(
|
6943
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6944
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6945
|
+
|
6946
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6947
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
6948
|
+
|
6949
|
+
tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
|
6950
|
+
|
6951
|
+
(void) src1;
|
6952
|
+
(void) dst;
|
6953
|
+
(void) src1_dd;
|
6954
|
+
}
|
6955
|
+
|
6567
6956
|
inline void ggml_cuda_op_relu(
|
6568
6957
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6569
6958
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
@@ -6578,6 +6967,23 @@ inline void ggml_cuda_op_relu(
|
|
6578
6967
|
(void) src1_dd;
|
6579
6968
|
}
|
6580
6969
|
|
6970
|
+
inline void ggml_cuda_op_leaky_relu(
|
6971
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6972
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6973
|
+
|
6974
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6975
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
6976
|
+
|
6977
|
+
float negative_slope;
|
6978
|
+
memcpy(&negative_slope, dst->op_params, sizeof(float));
|
6979
|
+
|
6980
|
+
leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
|
6981
|
+
|
6982
|
+
(void) src1;
|
6983
|
+
(void) dst;
|
6984
|
+
(void) src1_dd;
|
6985
|
+
}
|
6986
|
+
|
6581
6987
|
inline void ggml_cuda_op_sqr(
|
6582
6988
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6583
6989
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
@@ -6612,6 +7018,73 @@ inline void ggml_cuda_op_norm(
|
|
6612
7018
|
(void) src1_dd;
|
6613
7019
|
}
|
6614
7020
|
|
7021
|
+
|
7022
|
+
inline void ggml_cuda_op_group_norm(
|
7023
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
7024
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
7025
|
+
|
7026
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
7027
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
7028
|
+
|
7029
|
+
int num_groups = dst->op_params[0];
|
7030
|
+
int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
|
7031
|
+
group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
|
7032
|
+
|
7033
|
+
(void) src1;
|
7034
|
+
(void) dst;
|
7035
|
+
(void) src1_dd;
|
7036
|
+
}
|
7037
|
+
|
7038
|
+
inline void ggml_cuda_op_concat(
|
7039
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
7040
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
7041
|
+
|
7042
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
7043
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
7044
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
7045
|
+
|
7046
|
+
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
|
7047
|
+
concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
|
7048
|
+
}
|
7049
|
+
|
7050
|
+
(void) src1;
|
7051
|
+
(void) dst;
|
7052
|
+
}
|
7053
|
+
|
7054
|
+
inline void ggml_cuda_op_upscale(
|
7055
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
7056
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
7057
|
+
|
7058
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
7059
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
7060
|
+
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
7061
|
+
|
7062
|
+
const int scale_factor = dst->op_params[0];
|
7063
|
+
|
7064
|
+
upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
|
7065
|
+
|
7066
|
+
(void) src1;
|
7067
|
+
(void) dst;
|
7068
|
+
(void) src1_dd;
|
7069
|
+
}
|
7070
|
+
|
7071
|
+
inline void ggml_cuda_op_pad(
|
7072
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
7073
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
7074
|
+
|
7075
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
7076
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
7077
|
+
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
7078
|
+
|
7079
|
+
pad_f32_cuda(src0_dd, dst_dd,
|
7080
|
+
src0->ne[0], src0->ne[1], src0->ne[2],
|
7081
|
+
dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
|
7082
|
+
|
7083
|
+
(void) src1;
|
7084
|
+
(void) dst;
|
7085
|
+
(void) src1_dd;
|
7086
|
+
}
|
7087
|
+
|
6615
7088
|
inline void ggml_cuda_op_rms_norm(
|
6616
7089
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6617
7090
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
@@ -6913,7 +7386,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|
6913
7386
|
|
6914
7387
|
const int compute_capability = g_compute_capabilities[id];
|
6915
7388
|
|
6916
|
-
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
|
7389
|
+
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
6917
7390
|
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
|
6918
7391
|
half * src0_as_f16 = nullptr;
|
6919
7392
|
size_t src0_as = 0;
|
@@ -7126,7 +7599,6 @@ inline void ggml_cuda_op_im2col(
|
|
7126
7599
|
|
7127
7600
|
const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
|
7128
7601
|
|
7129
|
-
const int64_t N = src1->ne[is_2D ? 3 : 2];
|
7130
7602
|
const int64_t IC = src1->ne[is_2D ? 2 : 1];
|
7131
7603
|
const int64_t IH = is_2D ? src1->ne[1] : 1;
|
7132
7604
|
const int64_t IW = src1->ne[0];
|
@@ -7137,17 +7609,15 @@ inline void ggml_cuda_op_im2col(
|
|
7137
7609
|
const int64_t OH = is_2D ? dst->ne[2] : 1;
|
7138
7610
|
const int64_t OW = dst->ne[1];
|
7139
7611
|
|
7140
|
-
const size_t
|
7141
|
-
const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
|
7612
|
+
const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
|
7142
7613
|
|
7143
|
-
im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
|
7144
|
-
OH, IW, IH, OW, IC, KH, KW, N,
|
7145
|
-
ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
|
7614
|
+
im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
|
7146
7615
|
|
7147
7616
|
(void) src0;
|
7148
7617
|
(void) src0_dd;
|
7149
7618
|
}
|
7150
7619
|
|
7620
|
+
|
7151
7621
|
inline void ggml_cuda_op_sum_rows(
|
7152
7622
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
7153
7623
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
@@ -7230,17 +7700,9 @@ inline void ggml_cuda_op_scale(
|
|
7230
7700
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
7231
7701
|
|
7232
7702
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
7233
|
-
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
7234
7703
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
7235
7704
|
|
7236
|
-
float scale;
|
7237
|
-
// HACK: support for ggml backend interface
|
7238
|
-
if (src1->backend == GGML_BACKEND_CPU) {
|
7239
|
-
scale = ((float *) src1->data)[0];
|
7240
|
-
} else {
|
7241
|
-
// TODO: pass pointer to kernel instead of copying to host
|
7242
|
-
CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
|
7243
|
-
}
|
7705
|
+
const float scale = ((float *) dst->op_params)[0];
|
7244
7706
|
|
7245
7707
|
scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
|
7246
7708
|
CUDA_CHECK(cudaGetLastError());
|
@@ -7287,8 +7749,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
|
|
7287
7749
|
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
|
7288
7750
|
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
|
7289
7751
|
|
7290
|
-
const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
|
7291
|
-
|
7292
7752
|
// dd = data device
|
7293
7753
|
float * src0_ddf = nullptr;
|
7294
7754
|
float * src1_ddf = nullptr;
|
@@ -7309,7 +7769,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
|
|
7309
7769
|
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
|
7310
7770
|
}
|
7311
7771
|
|
7312
|
-
if (use_src1
|
7772
|
+
if (use_src1) {
|
7313
7773
|
if (src1_on_device) {
|
7314
7774
|
src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
7315
7775
|
} else {
|
@@ -7357,6 +7817,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
|
|
7357
7817
|
}
|
7358
7818
|
|
7359
7819
|
#ifdef NDEBUG
|
7820
|
+
for (int id = 0; id < g_device_count; ++id) {
|
7821
|
+
CUDA_CHECK(ggml_cuda_set_device(id));
|
7822
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
7823
|
+
}
|
7824
|
+
|
7360
7825
|
for (int id = 0; id < g_device_count; ++id) {
|
7361
7826
|
CUDA_CHECK(ggml_cuda_set_device(id));
|
7362
7827
|
|
@@ -7408,8 +7873,6 @@ static void ggml_cuda_op_mul_mat(
|
|
7408
7873
|
const int nb2 = dst->nb[2];
|
7409
7874
|
const int nb3 = dst->nb[3];
|
7410
7875
|
|
7411
|
-
ggml_cuda_set_peer_access(ne11);
|
7412
|
-
|
7413
7876
|
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
|
7414
7877
|
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
|
7415
7878
|
|
@@ -7696,6 +8159,10 @@ static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, gg
|
|
7696
8159
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
|
7697
8160
|
}
|
7698
8161
|
|
8162
|
+
static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8163
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
|
8164
|
+
}
|
8165
|
+
|
7699
8166
|
static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7700
8167
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
|
7701
8168
|
}
|
@@ -7712,10 +8179,22 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
|
|
7712
8179
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
|
7713
8180
|
}
|
7714
8181
|
|
8182
|
+
static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8183
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
|
8184
|
+
}
|
8185
|
+
|
8186
|
+
static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8187
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
|
8188
|
+
}
|
8189
|
+
|
7715
8190
|
static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7716
8191
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
|
7717
8192
|
}
|
7718
8193
|
|
8194
|
+
static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8195
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
|
8196
|
+
}
|
8197
|
+
|
7719
8198
|
static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7720
8199
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
|
7721
8200
|
}
|
@@ -7724,6 +8203,22 @@ static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, g
|
|
7724
8203
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
|
7725
8204
|
}
|
7726
8205
|
|
8206
|
+
static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8207
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
|
8208
|
+
}
|
8209
|
+
|
8210
|
+
static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8211
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
|
8212
|
+
}
|
8213
|
+
|
8214
|
+
static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8215
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
|
8216
|
+
}
|
8217
|
+
|
8218
|
+
static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8219
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
|
8220
|
+
}
|
8221
|
+
|
7727
8222
|
static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7728
8223
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
|
7729
8224
|
}
|
@@ -7808,27 +8303,27 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
|
7808
8303
|
}
|
7809
8304
|
|
7810
8305
|
static __global__ void k_compute_batched_ptrs(
|
7811
|
-
const half * src0_as_f16, const half * src1_as_f16,
|
8306
|
+
const half * src0_as_f16, const half * src1_as_f16, char * dst,
|
7812
8307
|
const void ** ptrs_src, void ** ptrs_dst,
|
7813
|
-
|
7814
|
-
|
7815
|
-
|
7816
|
-
|
7817
|
-
|
7818
|
-
|
7819
|
-
|
7820
|
-
|
8308
|
+
int64_t ne12, int64_t ne13,
|
8309
|
+
int64_t ne23,
|
8310
|
+
size_t nb02, size_t nb03,
|
8311
|
+
size_t nb12, size_t nb13,
|
8312
|
+
size_t nbd2, size_t nbd3,
|
8313
|
+
int64_t r2, int64_t r3) {
|
8314
|
+
int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
|
8315
|
+
int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
|
7821
8316
|
|
7822
8317
|
if (i13 >= ne13 || i12 >= ne12) {
|
7823
8318
|
return;
|
7824
8319
|
}
|
7825
8320
|
|
7826
|
-
|
7827
|
-
|
8321
|
+
int64_t i03 = i13 / r3;
|
8322
|
+
int64_t i02 = i12 / r2;
|
7828
8323
|
|
7829
8324
|
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
|
7830
8325
|
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
|
7831
|
-
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *)
|
8326
|
+
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
|
7832
8327
|
}
|
7833
8328
|
|
7834
8329
|
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
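As extended above, k_compute_batched_ptrs now also receives the destination base pointer plus the dst strides nbd2/nbd3: one GPU thread per (i12, i13) batch index fills the pointer tables consumed by cublasGemmBatchedEx, and the divisions by r2 = ne12/ne02 and r3 = ne13/ne03 broadcast src0 across the batch dimensions. A host-side sketch of the same pointer arithmetic (plain C++; strides are assumed to already be byte strides of the converted buffers, whereas the kernel halves nb12/nb13 because those come from the F32 tensor while the data was converted to F16):

#include <cstddef>
#include <cstdint>
#include <vector>

// ptrs_src must hold 2*ne23 entries (src0 table then src1 table), ptrs_dst ne23 entries.
static void compute_batched_ptrs(
        const char * src0, const char * src1, char * dst,
        std::vector<const void *> & ptrs_src, std::vector<void *> & ptrs_dst,
        int64_t ne12, int64_t ne13, int64_t ne23,
        size_t nb02, size_t nb03, size_t nb12, size_t nb13,
        size_t nbd2, size_t nbd3, int64_t r2, int64_t r3) {
    for (int64_t i13 = 0; i13 < ne13; ++i13) {
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
            const int64_t i03 = i13 / r3;   // broadcast src0 over dim 3
            const int64_t i02 = i12 / r2;   // broadcast src0 over dim 2
            ptrs_src[0*ne23 + i12 + i13*ne12] = src0 + i02*nb02 + i03*nb03;
            ptrs_src[1*ne23 + i12 + i13*ne12] = src1 + i12*nb12 + i13*nb13;
            ptrs_dst[0*ne23 + i12 + i13*ne12] = dst  + i12*nbd2 + i13*nbd3;
        }
    }
}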
@@ -7884,7 +8379,41 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
7884
8379
|
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
|
7885
8380
|
|
7886
8381
|
size_t dst_as = 0;
|
7887
|
-
|
8382
|
+
|
8383
|
+
half * dst_f16 = nullptr;
|
8384
|
+
char * dst_t = nullptr;
|
8385
|
+
|
8386
|
+
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
|
8387
|
+
cudaDataType_t cu_data_type = CUDA_R_16F;
|
8388
|
+
|
8389
|
+
// dst strides
|
8390
|
+
size_t nbd2 = dst->nb[2];
|
8391
|
+
size_t nbd3 = dst->nb[3];
|
8392
|
+
|
8393
|
+
const half alpha_f16 = 1.0f;
|
8394
|
+
const half beta_f16 = 0.0f;
|
8395
|
+
|
8396
|
+
const float alpha_f32 = 1.0f;
|
8397
|
+
const float beta_f32 = 0.0f;
|
8398
|
+
|
8399
|
+
const void * alpha = &alpha_f16;
|
8400
|
+
const void * beta = &beta_f16;
|
8401
|
+
|
8402
|
+
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
8403
|
+
dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
|
8404
|
+
dst_t = (char *) dst_f16;
|
8405
|
+
|
8406
|
+
nbd2 /= sizeof(float) / sizeof(half);
|
8407
|
+
nbd3 /= sizeof(float) / sizeof(half);
|
8408
|
+
} else {
|
8409
|
+
dst_t = (char *) dst_ddf;
|
8410
|
+
|
8411
|
+
cu_compute_type = CUBLAS_COMPUTE_32F;
|
8412
|
+
cu_data_type = CUDA_R_32F;
|
8413
|
+
|
8414
|
+
alpha = &alpha_f32;
|
8415
|
+
beta = &beta_f32;
|
8416
|
+
}
|
7888
8417
|
|
7889
8418
|
GGML_ASSERT(ne12 % ne02 == 0);
|
7890
8419
|
GGML_ASSERT(ne13 % ne03 == 0);
|
@@ -7893,9 +8422,6 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
7893
8422
|
const int64_t r2 = ne12/ne02;
|
7894
8423
|
const int64_t r3 = ne13/ne03;
|
7895
8424
|
|
7896
|
-
const half alpha_f16 = 1.0f;
|
7897
|
-
const half beta_f16 = 0.0f;
|
7898
|
-
|
7899
8425
|
#if 0
|
7900
8426
|
// use cublasGemmEx
|
7901
8427
|
{
|
@@ -7905,12 +8431,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
7905
8431
|
int i02 = i12 / r2;
|
7906
8432
|
|
7907
8433
|
CUBLAS_CHECK(
|
7908
|
-
cublasGemmEx(g_cublas_handles[
|
8434
|
+
cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
7909
8435
|
ne01, ne11, ne10,
|
7910
|
-
|
7911
|
-
|
7912
|
-
|
7913
|
-
|
8436
|
+
alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
|
8437
|
+
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
|
8438
|
+
beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
|
8439
|
+
cu_compute_type,
|
7914
8440
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
7915
8441
|
}
|
7916
8442
|
}
|
@@ -7922,11 +8448,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
7922
8448
|
CUBLAS_CHECK(
|
7923
8449
|
cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
7924
8450
|
ne01, ne11, ne10,
|
7925
|
-
|
7926
|
-
|
7927
|
-
|
8451
|
+
alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
|
8452
|
+
(const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
|
8453
|
+
beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
|
7928
8454
|
ne12*ne13,
|
7929
|
-
|
8455
|
+
cu_compute_type,
|
7930
8456
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
7931
8457
|
} else {
|
7932
8458
|
// use cublasGemmBatchedEx
|
@@ -7943,24 +8469,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
7943
8469
|
|
7944
8470
|
dim3 block_dims(ne13, ne12);
|
7945
8471
|
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
|
7946
|
-
src0_as_f16, src1_as_f16,
|
8472
|
+
src0_as_f16, src1_as_f16, dst_t,
|
7947
8473
|
ptrs_src, ptrs_dst,
|
7948
8474
|
ne12, ne13,
|
7949
8475
|
ne23,
|
7950
8476
|
nb02, nb03,
|
7951
8477
|
nb12, nb13,
|
7952
|
-
|
8478
|
+
nbd2, nbd3,
|
7953
8479
|
r2, r3);
|
7954
8480
|
CUDA_CHECK(cudaGetLastError());
|
7955
8481
|
|
7956
8482
|
CUBLAS_CHECK(
|
7957
8483
|
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
7958
8484
|
ne01, ne11, ne10,
|
7959
|
-
|
7960
|
-
|
7961
|
-
|
8485
|
+
alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
|
8486
|
+
(const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
|
8487
|
+
beta, ( void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
|
7962
8488
|
ne23,
|
7963
|
-
|
8489
|
+
cu_compute_type,
|
7964
8490
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
7965
8491
|
|
7966
8492
|
if (ptrs_src_s != 0) {
|
@@ -7972,11 +8498,14 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
7972
8498
|
}
|
7973
8499
|
#endif
|
7974
8500
|
|
7975
|
-
|
7976
|
-
|
8501
|
+
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
8502
|
+
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
8503
|
+
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
|
8504
|
+
|
8505
|
+
ggml_cuda_pool_free(dst_f16, dst_as);
|
8506
|
+
}
|
7977
8507
|
|
7978
8508
|
ggml_cuda_pool_free(src1_as_f16, src1_as);
|
7979
|
-
ggml_cuda_pool_free(dst_f16, dst_as);
|
7980
8509
|
}
|
7981
8510
|
|
7982
8511
|
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
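The batched-cuBLAS changes above make the output precision selectable: with GGML_PREC_DEFAULT the GEMM accumulates in F16 into a temporary half buffer (hence the dst strides divided by sizeof(float)/sizeof(half)) and the result is converted back to F32 at the end; otherwise cuBLAS computes and writes F32 directly into dst_ddf. A small sketch of that selection, with simplified names standing in for the ggml fields:

#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cstddef>

// "prec_default" stands in for dst->op_params[0] == GGML_PREC_DEFAULT.
struct gemm_config {
    cublasComputeType_t compute_type;
    cudaDataType_t      data_type;
    size_t              dst_elt_size;  // element size of the buffer the GEMM writes into
    bool                convert_back;  // F16 results are converted to F32 afterwards
};

static gemm_config select_gemm_config(const bool prec_default) {
    if (prec_default) {
        return { CUBLAS_COMPUTE_16F, CUDA_R_16F, sizeof(__half), true  }; // fast path: F16 accumulation + temp buffer
    }
    return     { CUBLAS_COMPUTE_32F, CUDA_R_32F, sizeof(float),  false }; // higher precision: write F32 in place
}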
@@ -8234,36 +8763,145 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
|
|
8234
8763
|
}
|
8235
8764
|
#endif
|
8236
8765
|
|
8237
|
-
static void ggml_cuda_mul_mat_id(const ggml_tensor *
|
8766
|
+
static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8238
8767
|
#if 0
|
8239
|
-
//#ifdef CUDA_USE_TENSOR_CORES
|
8240
|
-
// const bool use_tensor_cores = true;
|
8241
|
-
//#else
|
8242
|
-
// const bool use_tensor_cores = false;
|
8243
|
-
//#endif
|
8244
|
-
|
8245
8768
|
ggml_cuda_mul_mat_id_cublas(dst);
|
8246
|
-
|
8247
8769
|
// TODO: mmq/mmv support
|
8248
|
-
#
|
8249
|
-
const struct ggml_tensor * ids = dst->src[0];
|
8250
|
-
const struct ggml_tensor * src1 = dst->src[1];
|
8251
|
-
const int id = dst->op_params[0];
|
8770
|
+
#endif
|
8252
8771
|
|
8253
|
-
|
8772
|
+
const int64_t nb11 = src1->nb[1];
|
8773
|
+
const int64_t nb1 = dst->nb[1];
|
8254
8774
|
|
8255
|
-
|
8256
|
-
|
8257
|
-
|
8775
|
+
const struct ggml_tensor * ids = src0;
|
8776
|
+
const int32_t id = ((int32_t *) dst->op_params)[0];
|
8777
|
+
const int32_t n_as = ((int32_t *) dst->op_params)[1];
|
8258
8778
|
|
8259
|
-
|
8260
|
-
const struct ggml_tensor * src0 = dst->src[a_id + 2];
|
8779
|
+
std::vector<char> ids_host(ggml_nbytes(ids));
|
8261
8780
|
|
8262
|
-
|
8263
|
-
|
8781
|
+
const cudaStream_t stream = g_cudaStreams[g_main_device][0];
|
8782
|
+
|
8783
|
+
if (ids->backend == GGML_BACKEND_GPU) {
|
8784
|
+
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
|
8785
|
+
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
|
8786
|
+
CUDA_CHECK(cudaStreamSynchronize(stream));
|
8787
|
+
} else {
|
8788
|
+
memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
|
8789
|
+
}
|
8264
8790
|
|
8265
|
-
(
|
8266
|
-
(
|
8791
|
+
const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
|
8792
|
+
const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
|
8793
|
+
|
8794
|
+
ggml_tensor_extra_gpu src1_row_extra;
|
8795
|
+
ggml_tensor_extra_gpu dst_row_extra;
|
8796
|
+
|
8797
|
+
ggml_tensor src1_row = *src1;
|
8798
|
+
ggml_tensor dst_row = *dst;
|
8799
|
+
|
8800
|
+
src1_row.backend = GGML_BACKEND_GPU;
|
8801
|
+
dst_row.backend = GGML_BACKEND_GPU;
|
8802
|
+
|
8803
|
+
src1_row.extra = &src1_row_extra;
|
8804
|
+
dst_row.extra = &dst_row_extra;
|
8805
|
+
|
8806
|
+
char * src1_original = src1->backend == GGML_BACKEND_CPU ?
|
8807
|
+
(char *) src1->data : (char *) src1_extra->data_device[g_main_device];
|
8808
|
+
char * dst_original = dst->backend == GGML_BACKEND_CPU ?
|
8809
|
+
(char *) dst->data : (char *) dst_extra->data_device[g_main_device];
|
8810
|
+
|
8811
|
+
if (src1->ne[1] == 1) {
|
8812
|
+
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
8813
|
+
GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
|
8814
|
+
|
8815
|
+
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
8816
|
+
//int32_t row_id;
|
8817
|
+
//CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
|
8818
|
+
//CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
|
8819
|
+
|
8820
|
+
const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
|
8821
|
+
|
8822
|
+
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
8823
|
+
|
8824
|
+
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
|
8825
|
+
|
8826
|
+
src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
|
8827
|
+
src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
|
8828
|
+
|
8829
|
+
dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
|
8830
|
+
dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
|
8831
|
+
|
8832
|
+
ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
|
8833
|
+
}
|
8834
|
+
} else {
|
8835
|
+
size_t as_src1, as_dst;
|
8836
|
+
char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
|
8837
|
+
char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst);
|
8838
|
+
|
8839
|
+
src1_row_extra.data_device[g_main_device] = src1_contiguous;
|
8840
|
+
dst_row_extra.data_device[g_main_device] = dst_contiguous;
|
8841
|
+
|
8842
|
+
const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
|
8843
|
+
cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
|
8844
|
+
const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
|
8845
|
+
cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
|
8846
|
+
|
8847
|
+
for (int32_t row_id = 0; row_id < n_as; ++row_id) {
|
8848
|
+
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
|
8849
|
+
|
8850
|
+
int64_t num_src1_rows = 0;
|
8851
|
+
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
8852
|
+
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
|
8853
|
+
|
8854
|
+
if (row_id_i != row_id) {
|
8855
|
+
continue;
|
8856
|
+
}
|
8857
|
+
|
8858
|
+
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
8859
|
+
|
8860
|
+
CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
|
8861
|
+
nb11, src1_kind, stream));
|
8862
|
+
num_src1_rows++;
|
8863
|
+
}
|
8864
|
+
|
8865
|
+
if (num_src1_rows == 0) {
|
8866
|
+
continue;
|
8867
|
+
}
|
8868
|
+
|
8869
|
+
src1_row.ne[1] = num_src1_rows;
|
8870
|
+
dst_row.ne[1] = num_src1_rows;
|
8871
|
+
|
8872
|
+
src1_row.nb[1] = nb11;
|
8873
|
+
src1_row.nb[2] = num_src1_rows*nb11;
|
8874
|
+
src1_row.nb[3] = num_src1_rows*nb11;
|
8875
|
+
|
8876
|
+
dst_row.nb[1] = nb1;
|
8877
|
+
dst_row.nb[2] = num_src1_rows*nb1;
|
8878
|
+
dst_row.nb[3] = num_src1_rows*nb1;
|
8879
|
+
|
8880
|
+
ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
|
8881
|
+
|
8882
|
+
num_src1_rows = 0;
|
8883
|
+
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
8884
|
+
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
|
8885
|
+
|
8886
|
+
if (row_id_i != row_id) {
|
8887
|
+
continue;
|
8888
|
+
}
|
8889
|
+
|
8890
|
+
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
8891
|
+
|
8892
|
+
CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
|
8893
|
+
nb1, dst_kind, stream));
|
8894
|
+
num_src1_rows++;
|
8895
|
+
}
|
8896
|
+
}
|
8897
|
+
|
8898
|
+
ggml_cuda_pool_free(src1_contiguous, as_src1);
|
8899
|
+
ggml_cuda_pool_free(dst_contiguous, as_dst);
|
8900
|
+
}
|
8901
|
+
|
8902
|
+
if (dst->backend == GGML_BACKEND_CPU) {
|
8903
|
+
CUDA_CHECK(cudaStreamSynchronize(stream));
|
8904
|
+
}
|
8267
8905
|
}
|
8268
8906
|
|
8269
8907
|
static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
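The new ggml_cuda_mul_mat_id path above copies the expert ids to the host and then, for each expert, gathers the src1 rows routed to it into a contiguous scratch buffer, runs a single matmul against that expert's weight tensor (dst->src[row_id + 2]), and scatters the results back to their original dst rows; the single-token case (src1->ne[1] == 1) skips the gather and dispatches row by row instead. A host-side sketch of the gather/compute/scatter loop, with hypothetical names and a callback standing in for ggml_cuda_mul_mat:

#include <cstdint>
#include <cstring>
#include <vector>

static void mul_mat_id_sketch(
        const std::vector<int32_t> & ids,          // one expert id per src1 row
        int32_t n_experts,
        const char * src1, char * dst,
        size_t src1_row_size, size_t dst_row_size,
        void (*matmul)(int32_t expert, const char * rows, char * out, int64_t n_rows)) {
    std::vector<char> src1_contig(ids.size() * src1_row_size);
    std::vector<char> dst_contig (ids.size() * dst_row_size);

    for (int32_t expert = 0; expert < n_experts; ++expert) {
        // gather: copy every row routed to this expert into a contiguous buffer
        int64_t n_rows = 0;
        for (size_t i = 0; i < ids.size(); ++i) {
            if (ids[i] != expert) continue;
            std::memcpy(src1_contig.data() + n_rows*src1_row_size, src1 + i*src1_row_size, src1_row_size);
            n_rows++;
        }
        if (n_rows == 0) continue;

        // one matmul per expert over its gathered rows
        matmul(expert, src1_contig.data(), dst_contig.data(), n_rows);

        // scatter: copy the results back to their original row positions in dst
        n_rows = 0;
        for (size_t i = 0; i < ids.size(); ++i) {
            if (ids[i] != expert) continue;
            std::memcpy(dst + i*dst_row_size, dst_contig.data() + n_rows*dst_row_size, dst_row_size);
            n_rows++;
        }
    }
}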
@@ -8373,6 +9011,12 @@ static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, gg
|
|
8373
9011
|
(void) dst;
|
8374
9012
|
}
|
8375
9013
|
|
9014
|
+
static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
|
9015
|
+
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
9016
|
+
|
9017
|
+
return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
|
9018
|
+
}
|
9019
|
+
|
8376
9020
|
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
8377
9021
|
const int64_t nrows = ggml_nrows(tensor);
|
8378
9022
|
|
@@ -8422,13 +9066,12 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
|
8422
9066
|
|
8423
9067
|
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
8424
9068
|
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
8425
|
-
size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
|
8426
|
-
* ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
|
9069
|
+
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
8427
9070
|
}
|
8428
9071
|
|
8429
9072
|
char * buf;
|
8430
9073
|
CUDA_CHECK(cudaMalloc(&buf, size));
|
8431
|
-
char * buf_host = (char*)data + offset_split;
|
9074
|
+
char * buf_host = (char *)data + offset_split;
|
8432
9075
|
|
8433
9076
|
// set padding to 0 to avoid possible NaN values
|
8434
9077
|
if (size > original_size) {
|
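Both here and in ggml_backend_cuda_buffer_type_get_alloc_size further below, the padding of the last row of a quantized tensor is now expressed through ggml_row_size(type, n) rather than spelling out n*type_size/block_size by hand. A toy calculation of the padded size, assuming Q4_0-style blocks of 32 elements at 18 bytes and MATRIX_ROW_PADDING = 512 (numbers chosen for illustration only):

#include <cstddef>
#include <cstdio>

// ggml_row_size(type, n) modeled as n / block_size * bytes_per_block
static size_t row_size(size_t n, size_t block_size, size_t bytes_per_block) {
    return n / block_size * bytes_per_block;
}

int main() {
    const size_t padding    = 512;   // MATRIX_ROW_PADDING
    const size_t ne0        = 4000;  // example row length in elements
    const size_t block_size = 32;
    const size_t block_sz_b = 18;

    size_t size = row_size(ne0, block_size, block_sz_b);              // unpadded row: 2250 bytes
    if (ne0 % padding != 0) {
        size += row_size(padding - ne0 % padding, block_size, block_sz_b);  // pad by 96 elements = 54 bytes
    }
    printf("padded row: %zu bytes\n", size);                          // -> 2304 bytes
    return 0;
}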
@@ -8450,7 +9093,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
|
8450
9093
|
}
|
8451
9094
|
|
8452
9095
|
void ggml_cuda_free_data(struct ggml_tensor * tensor) {
|
8453
|
-
if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
|
9096
|
+
if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
|
8454
9097
|
return;
|
8455
9098
|
}
|
8456
9099
|
|
@@ -8573,11 +9216,10 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
|
|
8573
9216
|
|
8574
9217
|
ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
|
8575
9218
|
|
8576
|
-
const bool inplace =
|
8577
|
-
tensor->op == GGML_OP_VIEW;
|
9219
|
+
const bool inplace = tensor->view_src != nullptr;
|
8578
9220
|
|
8579
|
-
if (inplace && (tensor->
|
8580
|
-
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->
|
9221
|
+
if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
|
9222
|
+
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
|
8581
9223
|
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
8582
9224
|
size_t view_offset = 0;
|
8583
9225
|
if (tensor->op == GGML_OP_VIEW) {
|
@@ -8657,14 +9299,14 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8657
9299
|
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
8658
9300
|
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|
8659
9301
|
|
8660
|
-
if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
|
9302
|
+
if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
|
8661
9303
|
return false;
|
8662
9304
|
}
|
8663
9305
|
|
8664
9306
|
if (tensor->op == GGML_OP_MUL_MAT) {
|
8665
9307
|
if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
|
8666
9308
|
#ifndef NDEBUG
|
8667
|
-
fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
|
9309
|
+
fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
|
8668
9310
|
#endif
|
8669
9311
|
return false;
|
8670
9312
|
}
|
@@ -8683,6 +9325,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8683
9325
|
case GGML_OP_ADD:
|
8684
9326
|
func = ggml_cuda_add;
|
8685
9327
|
break;
|
9328
|
+
case GGML_OP_ACC:
|
9329
|
+
func = ggml_cuda_acc;
|
9330
|
+
break;
|
8686
9331
|
case GGML_OP_MUL:
|
8687
9332
|
func = ggml_cuda_mul;
|
8688
9333
|
break;
|
@@ -8697,6 +9342,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8697
9342
|
case GGML_UNARY_OP_SILU:
|
8698
9343
|
func = ggml_cuda_silu;
|
8699
9344
|
break;
|
9345
|
+
case GGML_UNARY_OP_GELU_QUICK:
|
9346
|
+
func = ggml_cuda_gelu_quick;
|
9347
|
+
break;
|
9348
|
+
case GGML_UNARY_OP_TANH:
|
9349
|
+
func = ggml_cuda_tanh;
|
9350
|
+
break;
|
8700
9351
|
case GGML_UNARY_OP_RELU:
|
8701
9352
|
func = ggml_cuda_relu;
|
8702
9353
|
break;
|
@@ -8707,6 +9358,21 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8707
9358
|
case GGML_OP_NORM:
|
8708
9359
|
func = ggml_cuda_norm;
|
8709
9360
|
break;
|
9361
|
+
case GGML_OP_GROUP_NORM:
|
9362
|
+
func = ggml_cuda_group_norm;
|
9363
|
+
break;
|
9364
|
+
case GGML_OP_CONCAT:
|
9365
|
+
func = ggml_cuda_concat;
|
9366
|
+
break;
|
9367
|
+
case GGML_OP_UPSCALE:
|
9368
|
+
func = ggml_cuda_upscale;
|
9369
|
+
break;
|
9370
|
+
case GGML_OP_PAD:
|
9371
|
+
func = ggml_cuda_pad;
|
9372
|
+
break;
|
9373
|
+
case GGML_OP_LEAKY_RELU:
|
9374
|
+
func = ggml_cuda_leaky_relu;
|
9375
|
+
break;
|
8710
9376
|
case GGML_OP_RMS_NORM:
|
8711
9377
|
func = ggml_cuda_rms_norm;
|
8712
9378
|
break;
|
@@ -8729,9 +9395,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8729
9395
|
func = ggml_cuda_sqr;
|
8730
9396
|
break;
|
8731
9397
|
case GGML_OP_CLAMP:
|
8732
|
-
if (!any_on_device) {
|
8733
|
-
return false;
|
8734
|
-
}
|
8735
9398
|
func = ggml_cuda_clamp;
|
8736
9399
|
break;
|
8737
9400
|
case GGML_OP_CPY:
|
@@ -8740,6 +9403,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8740
9403
|
case GGML_OP_CONT:
|
8741
9404
|
func = ggml_cuda_dup;
|
8742
9405
|
break;
|
9406
|
+
case GGML_OP_NONE:
|
8743
9407
|
case GGML_OP_RESHAPE:
|
8744
9408
|
case GGML_OP_VIEW:
|
8745
9409
|
case GGML_OP_PERMUTE:
|
@@ -8771,6 +9435,10 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8771
9435
|
return false;
|
8772
9436
|
}
|
8773
9437
|
|
9438
|
+
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
|
9439
|
+
ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
|
9440
|
+
}
|
9441
|
+
|
8774
9442
|
if (params->ith != 0) {
|
8775
9443
|
return true;
|
8776
9444
|
}
|
@@ -8844,7 +9512,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
|
|
8844
9512
|
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
8845
9513
|
|
8846
9514
|
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
8847
|
-
assert(tensor->view_src->buffer->buft == buffer->buft);
|
9515
|
+
assert(tensor->view_src->buffer->buft == buffer->buft);
|
8848
9516
|
tensor->backend = tensor->view_src->backend;
|
8849
9517
|
tensor->extra = tensor->view_src->extra;
|
8850
9518
|
return;
|
@@ -8875,23 +9543,34 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
|
|
8875
9543
|
}
|
8876
9544
|
|
8877
9545
|
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
8878
|
-
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
8879
|
-
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
8880
9546
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
8881
9547
|
|
8882
|
-
|
9548
|
+
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
8883
9549
|
|
8884
|
-
|
9550
|
+
ggml_cuda_set_device(ctx->device);
|
9551
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
9552
|
+
|
9553
|
+
CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
|
8885
9554
|
}
|
8886
9555
|
|
8887
9556
|
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
8888
|
-
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
8889
|
-
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
8890
9557
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
8891
9558
|
|
9559
|
+
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
9560
|
+
|
9561
|
+
ggml_cuda_set_device(ctx->device);
|
9562
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
9563
|
+
|
8892
9564
|
CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
|
9565
|
+
}
|
8893
9566
|
|
8894
|
-
|
9567
|
+
static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
9568
|
+
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
9569
|
+
|
9570
|
+
ggml_cuda_set_device(ctx->device);
|
9571
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
9572
|
+
|
9573
|
+
CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
|
8895
9574
|
}
|
8896
9575
|
|
8897
9576
|
static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
|
@@ -8902,6 +9581,7 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
|
|
8902
9581
|
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
|
8903
9582
|
/* .cpy_tensor_from = */ NULL,
|
8904
9583
|
/* .cpy_tensor_to = */ NULL,
|
9584
|
+
/* .clear = */ ggml_backend_cuda_buffer_clear,
|
8905
9585
|
};
|
8906
9586
|
|
8907
9587
|
// cuda buffer type
|
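The buffer callbacks above now select the buffer's device and synchronize before issuing their blocking copies, and the interface gains a clear entry that memsets the whole device allocation. A minimal sketch of such a clear hook, assuming a context struct that records the owning device, base pointer and size (ggml_cuda_set_device is modeled here with plain cudaSetDevice):

#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

struct buffer_ctx {
    int    device;   // device the buffer was allocated on
    void * dev_ptr;  // base device pointer of the allocation
    size_t size;     // total allocation size in bytes
};

#define CU_CHECK(call)                                                       \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_));   \
            exit(1);                                                         \
        }                                                                    \
    } while (0)

static void buffer_clear(buffer_ctx * ctx, uint8_t value) {
    CU_CHECK(cudaSetDevice(ctx->device));   // operate on the buffer's device
    CU_CHECK(cudaDeviceSynchronize());      // wait for any pending async work
    CU_CHECK(cudaMemset(ctx->dev_ptr, value, ctx->size));
}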
@@ -8938,8 +9618,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
|
|
8938
9618
|
|
8939
9619
|
if (ggml_is_quantized(tensor->type)) {
|
8940
9620
|
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
8941
|
-
size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
|
8942
|
-
* ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
|
9621
|
+
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
8943
9622
|
}
|
8944
9623
|
}
|
8945
9624
|
|
@@ -8954,35 +9633,36 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t
|
|
8954
9633
|
UNUSED(buft);
|
8955
9634
|
}
|
8956
9635
|
|
8957
|
-
static ggml_backend_buffer_type_i
|
9636
|
+
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
8958
9637
|
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
8959
9638
|
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
8960
9639
|
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
8961
9640
|
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
9641
|
+
/* .is_host = */ nullptr,
|
8962
9642
|
};
|
8963
9643
|
|
8964
9644
|
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
8965
|
-
static struct ggml_backend_buffer_type
|
8966
|
-
|
8967
|
-
|
9645
|
+
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
|
9646
|
+
|
9647
|
+
static bool ggml_backend_cuda_buffer_type_initialized = false;
|
9648
|
+
|
9649
|
+
if (!ggml_backend_cuda_buffer_type_initialized) {
|
8968
9650
|
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
|
8969
|
-
|
8970
|
-
/* .iface = */
|
9651
|
+
ggml_backend_cuda_buffer_types[i] = {
|
9652
|
+
/* .iface = */ ggml_backend_cuda_buffer_type_interface,
|
8971
9653
|
/* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
|
8972
9654
|
};
|
8973
9655
|
}
|
8974
|
-
|
9656
|
+
ggml_backend_cuda_buffer_type_initialized = true;
|
8975
9657
|
}
|
8976
9658
|
|
8977
|
-
return &
|
9659
|
+
return &ggml_backend_cuda_buffer_types[device];
|
8978
9660
|
}
|
8979
9661
|
|
8980
9662
|
// host buffer type
|
8981
9663
|
|
8982
9664
|
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
8983
|
-
|
8984
|
-
CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
|
8985
|
-
delete ctx;
|
9665
|
+
CUDA_CHECK(cudaFreeHost(buffer->context));
|
8986
9666
|
}
|
8987
9667
|
|
8988
9668
|
static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
@@ -8995,24 +9675,21 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
|
|
8995
9675
|
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
|
8996
9676
|
|
8997
9677
|
return buffer;
|
8998
|
-
|
8999
|
-
UNUSED(buft);
|
9000
9678
|
}
|
9001
9679
|
|
9002
|
-
struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
|
9003
|
-
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
|
9004
|
-
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
9005
|
-
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
9006
|
-
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
9007
|
-
};
|
9008
|
-
|
9009
9680
|
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
9010
|
-
static struct ggml_backend_buffer_type
|
9011
|
-
/* .iface = */
|
9681
|
+
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
|
9682
|
+
/* .iface = */ {
|
9683
|
+
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
|
9684
|
+
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
9685
|
+
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
9686
|
+
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
9687
|
+
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
9688
|
+
},
|
9012
9689
|
/* .context = */ nullptr,
|
9013
9690
|
};
|
9014
9691
|
|
9015
|
-
return &
|
9692
|
+
return &ggml_backend_cuda_buffer_type_host;
|
9016
9693
|
}
|
9017
9694
|
|
9018
9695
|
// backend
|
@@ -9044,8 +9721,6 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
|
|
9044
9721
|
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
9045
9722
|
|
9046
9723
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
9047
|
-
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
9048
|
-
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
9049
9724
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
9050
9725
|
|
9051
9726
|
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
|
@@ -9055,8 +9730,6 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
|
|
9055
9730
|
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
9056
9731
|
|
9057
9732
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
9058
|
-
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
9059
|
-
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
9060
9733
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
9061
9734
|
|
9062
9735
|
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
|
@@ -9159,6 +9832,8 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
|
|
9159
9832
|
case GGML_UNARY_OP_GELU:
|
9160
9833
|
case GGML_UNARY_OP_SILU:
|
9161
9834
|
case GGML_UNARY_OP_RELU:
|
9835
|
+
case GGML_UNARY_OP_GELU_QUICK:
|
9836
|
+
case GGML_UNARY_OP_TANH:
|
9162
9837
|
return true;
|
9163
9838
|
default:
|
9164
9839
|
return false;
|
@@ -9181,6 +9856,45 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
|
|
9181
9856
|
}
|
9182
9857
|
return true;
|
9183
9858
|
} break;
|
9859
|
+
case GGML_OP_GET_ROWS:
|
9860
|
+
{
|
9861
|
+
switch (op->src[0]->type) {
|
9862
|
+
case GGML_TYPE_F16:
|
9863
|
+
case GGML_TYPE_F32:
|
9864
|
+
case GGML_TYPE_Q4_0:
|
9865
|
+
case GGML_TYPE_Q4_1:
|
9866
|
+
case GGML_TYPE_Q5_0:
|
9867
|
+
case GGML_TYPE_Q5_1:
|
9868
|
+
case GGML_TYPE_Q8_0:
|
9869
|
+
return true;
|
9870
|
+
default:
|
9871
|
+
return false;
|
9872
|
+
}
|
9873
|
+
} break;
|
9874
|
+
case GGML_OP_CPY:
|
9875
|
+
{
|
9876
|
+
ggml_type src0_type = op->src[0]->type;
|
9877
|
+
ggml_type src1_type = op->src[1]->type;
|
9878
|
+
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
|
9879
|
+
return true;
|
9880
|
+
}
|
9881
|
+
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
|
9882
|
+
return true;
|
9883
|
+
}
|
9884
|
+
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
|
9885
|
+
return true;
|
9886
|
+
}
|
9887
|
+
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
|
9888
|
+
return true;
|
9889
|
+
}
|
9890
|
+
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
|
9891
|
+
return true;
|
9892
|
+
}
|
9893
|
+
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
|
9894
|
+
return true;
|
9895
|
+
}
|
9896
|
+
return false;
|
9897
|
+
} break;
|
9184
9898
|
case GGML_OP_NONE:
|
9185
9899
|
case GGML_OP_RESHAPE:
|
9186
9900
|
case GGML_OP_VIEW:
|
@@ -9188,7 +9902,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
|
|
9188
9902
|
case GGML_OP_TRANSPOSE:
|
9189
9903
|
case GGML_OP_NORM:
|
9190
9904
|
case GGML_OP_REPEAT:
|
9191
|
-
case GGML_OP_GET_ROWS:
|
9192
9905
|
case GGML_OP_DUP:
|
9193
9906
|
case GGML_OP_ADD:
|
9194
9907
|
case GGML_OP_MUL:
|
@@ -9197,7 +9910,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
|
|
9197
9910
|
case GGML_OP_SCALE:
|
9198
9911
|
case GGML_OP_SQR:
|
9199
9912
|
case GGML_OP_CLAMP:
|
9200
|
-
case GGML_OP_CPY:
|
9201
9913
|
case GGML_OP_CONT:
|
9202
9914
|
case GGML_OP_DIAG_MASK_INF:
|
9203
9915
|
case GGML_OP_SOFT_MAX:
|
@@ -9206,6 +9918,12 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
|
|
9206
9918
|
case GGML_OP_IM2COL:
|
9207
9919
|
case GGML_OP_SUM_ROWS:
|
9208
9920
|
case GGML_OP_ARGSORT:
|
9921
|
+
case GGML_OP_ACC:
|
9922
|
+
case GGML_OP_CONCAT:
|
9923
|
+
case GGML_OP_GROUP_NORM:
|
9924
|
+
case GGML_OP_UPSCALE:
|
9925
|
+
case GGML_OP_PAD:
|
9926
|
+
case GGML_OP_LEAKY_RELU:
|
9209
9927
|
return true;
|
9210
9928
|
default:
|
9211
9929
|
return false;
|
@@ -9264,7 +9982,9 @@ static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * use
|
|
9264
9982
|
UNUSED(params);
|
9265
9983
|
}
|
9266
9984
|
|
9267
|
-
extern "C" int ggml_backend_cuda_reg_devices()
|
9985
|
+
extern "C" int ggml_backend_cuda_reg_devices();
|
9986
|
+
|
9987
|
+
int ggml_backend_cuda_reg_devices() {
|
9268
9988
|
int device_count = ggml_cuda_get_device_count();
|
9269
9989
|
//int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
|
9270
9990
|
for (int i = 0; i < device_count; i++) {
|