llama_cpp 0.9.5 → 0.10.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/llama_cpp.cpp +123 -15
- data/ext/llama_cpp/src/ggml-alloc.c +42 -7
- data/ext/llama_cpp/src/ggml-alloc.h +8 -1
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1796 -413
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +998 -169
- data/ext/llama_cpp/src/ggml-metal.metal +2253 -274
- data/ext/llama_cpp/src/ggml-quants.c +2 -2
- data/ext/llama_cpp/src/ggml.c +634 -248
- data/ext/llama_cpp/src/ggml.h +81 -15
- data/ext/llama_cpp/src/llama.cpp +932 -352
- data/ext/llama_cpp/src/llama.h +28 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +22 -2
- metadata +2 -2
--- a/data/ext/llama_cpp/src/ggml-cuda.cu
+++ b/data/ext/llama_cpp/src/ggml-cuda.cu
@@ -1,12 +1,15 @@
 #include <algorithm>
+#include <assert.h>
+#include <atomic>
 #include <cinttypes>
 #include <cstddef>
 #include <cstdint>
+#include <float.h>
 #include <limits>
 #include <stdint.h>
 #include <stdio.h>
-#include <
-
+#include <vector>
+

 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
@@ -69,6 +72,7 @@
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamFireAndForget hipStreamFireAndForget
 #define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
@@ -190,7 +194,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
         fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
             cudaGetErrorString(err_)); \
         fprintf(stderr, "current device: %d\n", id); \
-
+        GGML_ASSERT(!"CUDA error"); \
     } \
 } while (0)

@@ -204,7 +208,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
         fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
             err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
         fprintf(stderr, "current device: %d\n", id); \
-
+        GGML_ASSERT(!"cuBLAS error"); \
     } \
 } while (0)
 #else
@@ -216,7 +220,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
         cudaGetDevice(&id); \
         fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
         fprintf(stderr, "current device: %d\n", id); \
-
+        GGML_ASSERT(!"cuBLAS error"); \
     } \
 } while (0)
 #endif // CUDART_VERSION >= 11
@@ -433,10 +437,9 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define WARP_SIZE 32
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

-#define CUDA_ADD_BLOCK_SIZE 256
-#define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_TANH_BLOCK_SIZE 256
 #define CUDA_RELU_BLOCK_SIZE 256
 #define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
@@ -449,6 +452,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 #define CUDA_GET_ROWS_BLOCK_SIZE 256
+#define CUDA_UPSCALE_BLOCK_SIZE 256
+#define CUDA_CONCAT_BLOCK_SIZE 256
+#define CUDA_PAD_BLOCK_SIZE 256
+#define CUDA_ACC_BLOCK_SIZE 256
+#define CUDA_IM2COL_BLOCK_SIZE 256

 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
@@ -527,40 +535,105 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
     return x;
 }

-static
-
+static __device__ __forceinline__ float op_repeat(const float a, const float b) {
+    return b;
+}

-
-
-}
-    dst[i] = x[i] + y[i%ky];
+static __device__ __forceinline__ float op_add(const float a, const float b) {
+    return a + b;
 }

-static
-
+static __device__ __forceinline__ float op_mul(const float a, const float b) {
+    return a * b;
+}

-
+static __device__ __forceinline__ float op_div(const float a, const float b) {
+    return a / b;
+}
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1, int s2, int s3,
+        /*int s10,*/ int s11, int s12, int s13) {
+    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
+    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
+    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
+
+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
         return;
     }
-
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
+        const int i10 = i0 % ne10;
+        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+    }
 }

-
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1, int s2, int s3,
+        /*int s10,*/ int s11, int s12, int s13) {
+
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

-
+    const int i3 = i/(ne2*ne1*ne0);
+    const int i2 = (i/(ne1*ne0)) % ne2;
+    const int i1 = (i/ne0) % ne1;
+    const int i0 = i % ne0;
+
+    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
         return;
     }
-    dst[i] = __half2float(x[i]) + y[i];
-}

-
-const int
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    const int i10 = i0 % ne10;
+    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+}

-
+static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, int offset) {
+    const int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= ne) {
         return;
     }
-
+    int src1_idx = i - offset;
+    int oz = src1_idx / nb2;
+    int oy = (src1_idx - (oz * nb2)) / nb1;
+    int ox = src1_idx % nb1;
+    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
+        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
+    } else {
+        dst[i] = x[i];
+    }
 }

 static __global__ void gelu_f32(const float * x, float * dst, const int k) {
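Note: the old dedicated `add_f32`/`mul_f32` kernels are replaced by the templated `k_bin_bcast`/`k_bin_bcast_unravel` pair, which cover add, mul, div, and repeat through the `bin_op` template parameter; broadcasting wraps every src1 index with `%`, so a smaller src1 tiles across the larger tensor. A minimal CPU sketch of that broadcast rule (illustrative only — `bin_bcast_ref` and `op_add_ref` are made-up names, not part of the diff):

```cpp
#include <cstdio>

// Reference for the modulo-broadcast rule k_bin_bcast implements, in 2-D:
// src1 indices wrap with '%', so a 3x1 tensor repeats across a 3x2 one.
template <float (*op)(float, float)>
void bin_bcast_ref(const float * a, const float * b, float * dst,
                   int ne0, int ne1,     // dst/src0 shape
                   int ne10, int ne11) { // src1 shape
    for (int i1 = 0; i1 < ne1; i1++) {
        for (int i0 = 0; i0 < ne0; i0++) {
            dst[i1*ne0 + i0] = op(a[i1*ne0 + i0],
                                  b[(i1 % ne11)*ne10 + (i0 % ne10)]);
        }
    }
}

float op_add_ref(float a, float b) { return a + b; }

int main() {
    const float a[6] = {1, 2, 3, 4, 5, 6}; // 3x2
    const float b[3] = {10, 20, 30};       // 3x1, repeats over the second dim
    float d[6];
    bin_bcast_ref<op_add_ref>(a, b, d, 3, 2, 3, 1);
    for (float v : d) printf("%g ", v);    // 11 22 33 14 25 36
    printf("\n");
}
```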
@@ -585,6 +658,23 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }

+static __global__ void gelu_quick_f32(const float *x, float *dst, int k) {
+    const float GELU_QUICK_COEF = -1.702f;
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
+}
+
+static __global__ void tanh_f32(const float *x, float *dst, int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+    if (i >= k) {
+        return;
+    }
+    dst[i] = tanhf(x[i]);
+}
+
 static __global__ void relu_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -594,6 +684,14 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
     dst[i] = fmaxf(x[i], 0);
 }

+static __global__ void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+    if (i >= k) {
+        return;
+    }
+    dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
+}
+
 static __global__ void sqr_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -604,12 +702,10 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
 }

 template <int block_size>
-static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;

-    const float eps = 1e-5f;
-
     float2 mean_var = make_float2(0.f, 0.f);

     for (int col = tid; col < ncols; col += block_size) {
@@ -641,6 +737,132 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     }
 }

+static __global__ void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02) {
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    // operation
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    if (blockIdx.z < ne02) { // src0
+        int offset_src =
+            nidx +
+            blockIdx.y * ne0 +
+            blockIdx.z * ne0 * gridDim.y;
+        dst[offset_dst] = x[offset_src];
+    } else {
+        int offset_src =
+            nidx +
+            blockIdx.y * ne0 +
+            (blockIdx.z - ne02) * ne0 * gridDim.y;
+        dst[offset_dst] = y[offset_src];
+    }
+}
+
+static __global__ void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor) {
+    int ne0 = ne00 * scale_factor;
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    // operation
+    int i00 = nidx / scale_factor;
+    int i01 = blockIdx.y / scale_factor;
+    int offset_src =
+        i00 +
+        i01 * ne00 +
+        blockIdx.z * nb02;
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    dst[offset_dst] = x[offset_src];
+}
+
+static __global__ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02) {
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+
+    // operation
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02) {
+        int offset_src =
+            nidx +
+            blockIdx.y * ne00 +
+            blockIdx.z * ne00 * ne01;
+        dst[offset_dst] = x[offset_src];
+    } else {
+        dst[offset_dst] = 0.0f;
+    }
+}
+
+template <int block_size>
+static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
+    int start = blockIdx.x * group_size;
+    int end = start + group_size;
+
+    start += threadIdx.x;
+
+    if (end >= ne_elements) {
+        end = ne_elements;
+    }
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int j = start; j < end; j += block_size) {
+        tmp += x[j];
+    }
+
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }
+
+    float mean = tmp / group_size;
+    tmp = 0.0f;
+
+    for (int j = start; j < end; j += block_size) {
+        float xi = x[j] - mean;
+        dst[j] = xi;
+        tmp += xi * xi;
+    }
+
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }
+
+    float variance = tmp / group_size;
+    float scale = rsqrtf(variance + eps);
+    for (int j = start; j < end; j += block_size) {
+        dst[j] *= scale;
+    }
+}
+
 template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
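Note: `group_norm_f32` computes each group's mean and variance in two passes, each ending in a `warp_reduce_sum` plus a shared-memory stage when the block spans more than one warp. A CPU reference for the math the kernel computes (a sketch with made-up names, not the device code):

```cpp
#include <cmath>
#include <cstdio>

// Per group: subtract the mean, then scale by rsqrt(variance + eps).
void group_norm_ref(const float * x, float * dst,
                    int num_groups, int group_size, float eps) {
    for (int g = 0; g < num_groups; g++) {
        const float * xg = x + g*group_size;
        float * dg = dst + g*group_size;

        float mean = 0.0f;                       // first reduction pass
        for (int j = 0; j < group_size; j++) mean += xg[j];
        mean /= group_size;

        float var = 0.0f;                        // second reduction pass
        for (int j = 0; j < group_size; j++) {
            dg[j] = xg[j] - mean;
            var += dg[j]*dg[j];
        }
        var /= group_size;

        const float scale = 1.0f/std::sqrt(var + eps);
        for (int j = 0; j < group_size; j++) dg[j] *= scale;
    }
}

int main() {
    const float x[4] = {1, 2, 3, 4};
    float d[4];
    group_norm_ref(x, d, 1, 4, 1e-6f);
    for (float v : d) printf("%f ", v); // ~ -1.342 -0.447 0.447 1.342
    printf("\n");
}
```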
@@ -1639,31 +1861,65 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
 }

 template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void k_get_rows(
-
-
-
-
+static __global__ void k_get_rows(
+            const void * src0, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
+
+    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
+    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
+    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+
+    if (i00 >= ne00) {
         return;
     }

-const int
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];

-
-const
-    const int di = row*ncols + col;
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;

-const int ib =
-const int iqs = (
-const int iybs =
+    const int ib = i00/qk; // block index
+    const int iqs = (i00%qk)/qr; // quant index
+    const int iybs = i00 - i00%qk; // dst block start index
     const int y_offset = qr == 1 ? 1 : qk/2;

     // dequantize
     dfloat2 v;
-dequantize_kernel(
+    dequantize_kernel(src0_row, ib, iqs, v);
+
+    dst_row[iybs + iqs + 0]        = v.x;
+    dst_row[iybs + iqs + y_offset] = v.y;
+}
+
+template<typename src0_t, typename dst_t>
+static __global__ void k_get_rows_float(
+            const src0_t * src0, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
+
+    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
+    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
+    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;

-
-
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
+
+    dst_row[i00] = src0_row[i00];
 }

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
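Note: the rewritten `k_get_rows` gathers whole rows by the indices in src1 and dequantizes two values per thread, which is why `i00` advances in steps of two and the launcher further down asserts `ne00 % 2 == 0`. A small sketch of the block/quant index arithmetic (illustrative values; `qk = 32`, `qr = 2` match the Q4_0 layout of 32 values at two quants per byte):

```cpp
#include <cstdio>

int main() {
    const int qk = 32, qr = 2;
    for (int i00 = 0; i00 < 8; i00 += 2) {
        const int ib   = i00/qk;          // which quant block the element is in
        const int iqs  = (i00%qk)/qr;     // quant index inside that block
        const int iybs = i00 - i00%qk;    // where this block starts in the output
        const int y_offset = qr == 1 ? 1 : qk/2;
        // the two dequantized values land at iybs+iqs and iybs+iqs+y_offset
        printf("i00=%d ib=%d iqs=%d -> dst %d and %d\n",
               i00, ib, iqs, iybs + iqs, iybs + iqs + y_offset);
    }
}
```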
@@ -4559,6 +4815,116 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
     cpy_1(cx + x_offset, cdst + dst_offset);
 }

+static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q8_0 * dsti = (block_q8_0 *) cdsti;
+
+    float amax = 0.0f; // absolute max
+
+    for (int j = 0; j < QK8_0; j++) {
+        const float v = xi[j];
+        amax = fmaxf(amax, fabsf(v));
+    }
+
+    const float d = amax / ((1 << 7) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK8_0; ++j) {
+        const float x0 = xi[j]*id;
+
+        dsti->qs[j] = roundf(x0);
+    }
+}
+
+static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_0 * dsti = (block_q4_0 *) cdsti;
+
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_0; ++j) {
+        const float v = xi[j];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            vmax = v;
+        }
+    }
+
+    const float d = vmax / -8;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK4_0/2; ++j) {
+        const float x0 = xi[0       + j]*id;
+        const float x1 = xi[QK4_0/2 + j]*id;
+
+        const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
+        const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
+
+        dsti->qs[j]  = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_1 * dsti = (block_q4_1 *) cdsti;
+
+    float vmin = FLT_MAX;
+    float vmax = -FLT_MAX;
+
+    for (int j = 0; j < QK4_1; ++j) {
+        const float v = xi[j];
+
+        if (v < vmin) vmin = v;
+        if (v > vmax) vmax = v;
+    }
+
+    const float d = (vmax - vmin) / ((1 << 4) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->dm.x = d;
+    dsti->dm.y = vmin;
+
+    for (int j = 0; j < QK4_1/2; ++j) {
+        const float x0 = (xi[0       + j] - vmin)*id;
+        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
+
+        const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
+        const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
+
+        dsti->qs[j]  = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+template <cpy_kernel_t cpy_blck, int qk>
+static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
+                                 const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+                                 const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
+    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int i02 = i / (ne00*ne01);
+    const int i01 = (i - i02*ne01*ne00) / ne00;
+    const int i00 = (i - i02*ne01*ne00 - i01*ne00);
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+    const int i12 = i / (ne10*ne11);
+    const int i11 = (i - i12*ne10*ne11) / ne10;
+    const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+    cpy_blck(cx + x_offset, cdst + dst_offset);
+}
+
 static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
     const float y = (i0 / 2 - low) / max(0.001f, high - low);
     return 1.0f - min(1.0f, max(0.0f, y));
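Note: `cpy_blck_f32_q8_0` quantizes one 32-value block in place during a copy: the scale `d` maps the block's absolute maximum to 127, and each value is rounded to `x/d`. A CPU sketch of the round trip, whose reconstruction error is bounded by `d/2` (illustrative, not the device code):

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    const int QK8_0 = 32;          // ggml's Q8_0 block size
    float x[32];
    for (int j = 0; j < QK8_0; j++) x[j] = 0.1f*j - 1.5f;

    float amax = 0.0f;             // absolute max of the block
    for (int j = 0; j < QK8_0; j++) amax = fmaxf(amax, fabsf(x[j]));

    const float d  = amax / ((1 << 7) - 1);   // scale maps amax to 127
    const float id = d ? 1.0f/d : 0.0f;

    int8_t qs[32];
    for (int j = 0; j < QK8_0; j++) qs[j] = (int8_t)roundf(x[j]*id);

    // dequantize and report the worst-case reconstruction error
    float max_err = 0.0f;
    for (int j = 0; j < QK8_0; j++) {
        max_err = fmaxf(max_err, fabsf(qs[j]*d - x[j]));
    }
    printf("d=%f max_err=%f (bound d/2=%f)\n", d, max_err, d/2);
}
```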
@@ -4713,6 +5079,65 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols,
     dst[i] = col * m_k + x[i];
 }

+static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.y;
+    const int col = threadIdx.x;
+
+    float sum = 0.0f;
+    for (int i = col; i < ncols; i += blockDim.x) {
+        sum += x[row * ncols + i];
+    }
+
+    sum = warp_reduce_sum(sum);
+
+    if (col == 0) {
+        dst[row] = sum;
+    }
+}
+
+template<typename T>
+static inline __device__ void swap(T & a, T & b) {
+    T tmp = a;
+    a = b;
+    b = tmp;
+}
+
+template<ggml_sort_order order>
+static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
+    // bitonic sort
+    int col = threadIdx.x;
+    int row = blockIdx.y;
+
+    if (col >= ncols) return;
+
+    const float * x_row = x + row * ncols;
+    int * dst_row = dst + row * ncols;
+
+    // initialize indices
+    if (col < ncols) {
+        dst_row[col] = col;
+    }
+    __syncthreads();
+
+    for (int k = 2; k <= ncols; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
+                        swap(dst_row[col], dst_row[ixj]);
+                    }
+                } else {
+                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
+                        swap(dst_row[col], dst_row[ixj]);
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+}
+
 static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
     const int col = blockDim.y*blockIdx.y + threadIdx.y;
     const int row = blockDim.x*blockIdx.x + threadIdx.x;
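Note: `k_argsort_f32_i32` is a bitonic sorting network over the column indices — one thread per column, one block per row — whose compare/swap schedule only works when `ncols` is a power of two (asserted in `argsort_f32_i32_cuda` further down in this diff). The same network run sequentially on the host (illustrative sketch):

```cpp
#include <cstdio>
#include <utility>

int main() {
    const int n = 8;                       // must be a power of two
    const float x[n] = {3, 1, 4, 1, 5, 9, 2, 6};
    int idx[n];
    for (int i = 0; i < n; i++) idx[i] = i;

    // exactly the kernel's loop structure, with 'col' iterated sequentially
    for (int k = 2; k <= n; k *= 2) {
        for (int j = k/2; j > 0; j /= 2) {
            for (int col = 0; col < n; col++) {
                const int ixj = col ^ j;
                if (ixj > col) {
                    const bool up = (col & k) == 0;  // direction of this subsequence
                    if (up ? x[idx[col]] > x[idx[ixj]]
                           : x[idx[col]] < x[idx[ixj]]) {
                        std::swap(idx[col], idx[ixj]);
                    }
                }
            }
        }
    }
    for (int i = 0; i < n; i++) printf("%d ", idx[i]); // indices of x in ascending order
    printf("\n");
}
```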
@@ -4722,8 +5147,9 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
     }

     const int i = row*ncols + col;
-    //
-    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
+    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
 }

 static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
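Note: the mask now subtracts `FLT_MAX` instead of `INT_MAX` — this is what the new `#include <float.h>` in the first hunk supports. Subtracting `FLT_MAX` from any ordinary logit yields a float so negative that `expf` underflows to zero in the following softmax (small illustrative check):

```cpp
#include <cfloat>
#include <cmath>
#include <cstdio>

int main() {
    const float logit = 12.3f;
    // the (col > n_past + ...) comparison contributes a 0 or 1 factor
    const float masked = logit - 1.0f*FLT_MAX;
    printf("masked=%g exp=%g\n", masked, expf(masked)); // exp underflows to 0
}
```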
@@ -4820,49 +5246,220 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,

 static __global__ void im2col_f32_f16(
         const float * x, half * dst,
-        int
+        int offset_delta, int IW, int IH, int OW, int KW, int KH, int pelements, int CHW,
         int s0, int s1, int p0, int p1, int d0, int d1) {
-    const int
-
+    const int i = threadIdx.x + blockIdx.x * blockDim.x;
+    if (i >= pelements) {
+        return;
+    }
+
+    const int ksize = OW * (KH > 1 ? KW : 1);
+    const int kx = i / ksize;
+    const int kd = kx * ksize;
+    const int ky = (i - kd) / OW;
+    const int ix = i % OW;
+
+    const int iiw = ix * s0 + kx * d0 - p0;
+    const int iih = blockIdx.y * s1 + ky * d1 - p1;

     const int offset_dst =
-        (
-        (blockIdx.
+        (blockIdx.y * OW + ix) * CHW +
+        (blockIdx.z * (KW * KH) + ky * KW + kx);

     if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
         dst[offset_dst] = __float2half(0.0f);
     } else {
-        const int offset_src =
+        const int offset_src = blockIdx.z * offset_delta;
         dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
     }
 }

 template<int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_cuda(const
+static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+                            const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (
-    const dim3 block_nums(block_num_x,
-
-
+    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    GGML_ASSERT(ne00 % 2 == 0);
+
+    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
+            src0_dd, src1_dd, dst_dd,
+            ne00, /*ne01, ne02, ne03,*/
+            /*ne10, ne11,*/ ne12, /*ne13,*/
+            /* s0,*/ s1, s2, s3,
+            /* nb00,*/ nb01, nb02, nb03,
+            s10, s11, s12/*, s13*/);

-
-    const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
-    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
+    (void) dst;
 }

-
-
-
-
+template<typename src0_t>
+static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+                                const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS

-
-    const int
-
+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
+    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
+            src0_dd, src1_dd, dst_dd,
+            ne00, /*ne01, ne02, ne03,*/
+            /*ne10, ne11,*/ ne12, /*ne13,*/
+            /* s0,*/ s1, s2, s3,
+            /* nb00,*/ nb01, nb02, nb03,
+            s10, s11, s12/*, s13*/);
+
+    (void) dst;
 }

-
-
-
+template<float (*bin_op)(const float, const float)>
+struct bin_bcast_cuda {
+    template<typename src0_t, typename src1_t, typename dst_t>
+    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
+            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
+            cudaStream_t stream) {
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        int nr0 = ne10/ne0;
+        int nr1 = ne11/ne1;
+        int nr2 = ne12/ne2;
+        int nr3 = ne13/ne3;
+
+        int nr[4] = { nr0, nr1, nr2, nr3 };
+
+        // collapse dimensions until first broadcast dimension
+        int64_t cne0[] = {ne0, ne1, ne2, ne3};
+        int64_t cne1[] = {ne10, ne11, ne12, ne13};
+        size_t cnb0[] = {nb0, nb1, nb2, nb3};
+        size_t cnb1[] = {nb10, nb11, nb12, nb13};
+        auto collapse = [](int64_t cne[]) {
+            cne[0] *= cne[1];
+            cne[1] = cne[2];
+            cne[2] = cne[3];
+            cne[3] = 1;
+        };
+
+        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
+            cnb[1] *= cne[1];
+            cnb[2] *= cne[2];
+            cnb[3] *= cne[3];
+        };
+
+        for (int i = 0; i < 4; i++) {
+            if (nr[i] != 1) {
+                break;
+            }
+            if (i > 0) {
+                collapse_nb(cnb0, cne0);
+                collapse_nb(cnb1, cne1);
+                collapse(cne0);
+                collapse(cne1);
+            }
+        }
+        {
+            int64_t ne0 = cne0[0];
+            int64_t ne1 = cne0[1];
+            int64_t ne2 = cne0[2];
+            int64_t ne3 = cne0[3];
+
+            int64_t ne10 = cne1[0];
+            int64_t ne11 = cne1[1];
+            int64_t ne12 = cne1[2];
+            int64_t ne13 = cne1[3];
+
+            size_t nb0 = cnb0[0];
+            size_t nb1 = cnb0[1];
+            size_t nb2 = cnb0[2];
+            size_t nb3 = cnb0[3];
+
+            size_t nb10 = cnb1[0];
+            size_t nb11 = cnb1[1];
+            size_t nb12 = cnb1[2];
+            size_t nb13 = cnb1[3];
+
+            size_t s0 = nb0 / sizeof(dst_t);
+            size_t s1 = nb1 / sizeof(dst_t);
+            size_t s2 = nb2 / sizeof(dst_t);
+            size_t s3 = nb3 / sizeof(dst_t);
+
+            size_t s10 = nb10 / sizeof(src1_t);
+            size_t s11 = nb11 / sizeof(src1_t);
+            size_t s12 = nb12 / sizeof(src1_t);
+            size_t s13 = nb13 / sizeof(src1_t);
+
+            GGML_ASSERT(s0 == 1);
+            GGML_ASSERT(s10 == 1);
+
+            const int block_size = 128;
+
+            int64_t hne0 = std::max(ne0/2LL, 1LL);
+
+            dim3 block_dims;
+            block_dims.x = std::min<unsigned int>(hne0, block_size);
+            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
+            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
+
+            dim3 block_nums(
+                (hne0 + block_dims.x - 1) / block_dims.x,
+                (ne1 + block_dims.y - 1) / block_dims.y,
+                (ne2*ne3 + block_dims.z - 1) / block_dims.z
+            );
+
+            if (block_nums.z > 65535) {
+                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
+                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
+                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd,
+                    ne0, ne1, ne2, ne3,
+                    ne10, ne11, ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s10, */ s11, s12, s13);
+            } else {
+                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd,
+                    ne0, ne1, ne2, ne3,
+                    ne10, ne11, ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s10, */ s11, s12, s13);
+            }
+        }
+    }
+};
+
+static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
+    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
 }

 static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
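Note: before launching, `bin_bcast_cuda::operator()` folds leading dimensions in which src1 does not broadcast into a single larger dimension, so the kernel indexes the smallest possible rank. A sketch of the collapse on concrete shapes (illustrative; `bcast` here stands in for the `nr[i] != 1` test on the precomputed repeat ratios):

```cpp
#include <cstdio>
#include <initializer_list>

int main() {
    long cne0[4] = {4, 3, 2, 5};   // dst/src0 shape, innermost dim first (ggml order)
    long cne1[4] = {4, 3, 1, 5};   // src1 broadcasts in dim 2
    bool bcast[4];
    for (int i = 0; i < 4; i++) bcast[i] = cne1[i] != cne0[i];

    for (int i = 0; i < 4; i++) {
        if (bcast[i]) break;                 // stop at the first broadcast dim
        if (i > 0) {                         // fold dim i-1 and i together
            for (long * cne : {cne0, cne1}) {
                cne[0] *= cne[1];
                cne[1]  = cne[2];
                cne[2]  = cne[3];
                cne[3]  = 1;
            }
        }
    }
    printf("collapsed dst: %ld x %ld x %ld x %ld\n",
           cne0[0], cne0[1], cne0[2], cne0[3]);
    // -> 12 x 2 x 5 x 1: dims 0 and 1 folded, the broadcast dim is preserved
}
```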
@@ -4875,27 +5472,74 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }

+static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
+    tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
     relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }

+static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
+}
+
 static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
     sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }

-static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
         const dim3 block_dims(WARP_SIZE, 1, 1);
-        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    }
+}
+
+static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
+    static const float eps = 1e-6f;
+    if (group_size < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
     } else {
         const dim3 block_dims(1024, 1, 1);
-
+        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
     }
 }

+static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2);
+    concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
+}
+
+static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int scale_factor, cudaStream_t stream) {
+    int ne0 = (ne00 * scale_factor);
+    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02);
+    upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
+}
+
+static void pad_f32_cuda(const float * x, float * dst,
+    const int ne00, const int ne01, const int ne02,
+    const int ne0, const int ne1, const int ne2, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2);
+    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02);
+}
+
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
@@ -4914,34 +5558,10 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
     quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
 }

-template<typename dst_t>
-static void
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-template<typename dst_t>
-static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-template<typename dst_t>
-static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-template<typename dst_t>
-static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-template<typename dst_t>
-static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<
+    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

 template<typename dst_t>
@@ -4990,6 +5610,64 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
 #endif
 }

+static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_cuda;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_cuda;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_cuda;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_cuda;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_cuda;
+        case GGML_TYPE_F32:
+            return dequantize_block_cuda<1, 1, convert_f32>;
+        default:
+            return nullptr;
+    }
+}
+
+static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_cuda;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_cuda;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_cuda;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_cuda;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_cuda;
+        case GGML_TYPE_F16:
+            return dequantize_block_cuda<1, 1, convert_f16>;
+        default:
+            return nullptr;
+    }
+}
+
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
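Note: the per-type `dequantize_row_*_cuda` launchers for the non-K quants are gone; `ggml_get_to_fp16_cuda`/`ggml_get_to_fp32_cuda` now return instantiations of the single `dequantize_block_cuda` template, selected by a switch over `ggml_type` (the old copies of these functions are deleted further down in this diff). A self-contained sketch of the same enum-to-template-instantiation dispatch pattern (all names here are made up for illustration):

```cpp
#include <cstdio>

enum my_type { TYPE_A, TYPE_B };

typedef void (*convert_t)(const float * src, float * dst, int n);

// one templated body, specialized per element transform
template <float (*f)(float)>
void convert_block(const float * src, float * dst, int n) {
    for (int i = 0; i < n; i++) dst[i] = f(src[i]);
}

float twice(float x) { return 2*x; }
float half_(float x) { return 0.5f*x; }

// the switch maps the runtime enum to a compile-time instantiation
convert_t get_converter(my_type t) {
    switch (t) {
        case TYPE_A: return convert_block<twice>;
        case TYPE_B: return convert_block<half_>;
        default:     return nullptr;
    }
}

int main() {
    const float x[3] = {1, 2, 3};
    float y[3];
    get_converter(TYPE_A)(x, y, 3);
    printf("%g %g %g\n", y[0], y[1], y[2]); // 2 4 6
}
```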
@@ -5078,6 +5756,15 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

+static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, 1, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    dequantize_mul_mat_vec<1, 1, convert_f16>
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -5168,83 +5855,6 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

-static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<1, 1, convert_f16>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return dequantize_row_q4_0_cuda;
-        case GGML_TYPE_Q4_1:
-            return dequantize_row_q4_1_cuda;
-        case GGML_TYPE_Q5_0:
-            return dequantize_row_q5_0_cuda;
-        case GGML_TYPE_Q5_1:
-            return dequantize_row_q5_1_cuda;
-        case GGML_TYPE_Q8_0:
-            return dequantize_row_q8_0_cuda;
-        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
-        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
-        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
-        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
-        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
-        case GGML_TYPE_F32:
-            return convert_fp32_to_fp16_cuda;
-        default:
-            return nullptr;
-    }
-}
-
-static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return dequantize_row_q4_0_cuda;
-        case GGML_TYPE_Q4_1:
-            return dequantize_row_q4_1_cuda;
-        case GGML_TYPE_Q5_0:
-            return dequantize_row_q5_0_cuda;
-        case GGML_TYPE_Q5_1:
-            return dequantize_row_q5_1_cuda;
-        case GGML_TYPE_Q8_0:
-            return dequantize_row_q8_0_cuda;
-        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
-        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
-        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
-        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
-        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
-        case GGML_TYPE_F16:
-            return convert_fp16_to_fp32_cuda;
-        default:
-            return nullptr;
-    }
-}
-
 static void ggml_mul_mat_q4_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
@@ -5737,6 +6347,39 @@ static void ggml_cpy_f32_f16_cuda(
         (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
 }

+static void ggml_cpy_f32_q8_0_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK8_0 == 0);
+    const int num_blocks = ne / QK8_0;
+    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void ggml_cpy_f32_q4_0_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK4_0 == 0);
+    const int num_blocks = ne / QK4_0;
+    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void ggml_cpy_f32_q4_1_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK4_1 == 0);
+    const int num_blocks = ne / QK4_1;
+    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
 static void ggml_cpy_f16_f16_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -5823,6 +6466,27 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const
     alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
 }

+static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const dim3 block_nums(1, nrows, 1);
+    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
+static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
+    // bitonic sort requires ncols to be power of 2
+    GGML_ASSERT((ncols & (ncols - 1)) == 0);
+
+    const dim3 block_dims(ncols, 1, 1);
+    const dim3 block_nums(1, nrows, 1);
+    if (order == GGML_SORT_ASC) {
+        k_argsort_f32_i32<GGML_SORT_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    } else if (order == GGML_SORT_DESC) {
+        k_argsort_f32_i32<GGML_SORT_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    } else {
+        GGML_ASSERT(false);
+    }
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
     const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -5838,13 +6502,14 @@ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, con
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
 }

-static void im2col_f32_f16_cuda(const float
-    int
-    int
-    int s0,
-
-
-
+static void im2col_f32_f16_cuda(const float* x, half* dst,
+    int IW, int IH, int OW, int OH, int KW, int KH, int IC,
+    int offset_delta,
+    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
+    const int parallel_elements = OW * KW * KH;
+    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
+    dim3 block_nums(num_blocks, OH, IC);
+    im2col_f32_f16<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, offset_delta, IW, IH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
 }

 // buffer pool for cuda
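Note: `im2col_f32_f16_cuda` launches one thread per (output column × kernel tap) element, with `blockIdx.y` covering output rows and `blockIdx.z` covering input channels. What im2col computes, as a CPU sketch for a single channel (illustrative; out-of-bounds taps become zero, matching the `__float2half(0.0f)` branch in the kernel):

```cpp
#include <cstdio>

int main() {
    const int IW = 4, IH = 4, KW = 2, KH = 2, s = 1, p = 0, d = 1;
    const int OW = (IW + 2*p - d*(KW - 1) - 1)/s + 1; // 3
    const int OH = (IH + 2*p - d*(KH - 1) - 1)/s + 1; // 3
    float x[IH][IW];
    for (int i = 0; i < IH*IW; i++) x[i/IW][i%IW] = (float)i;

    // each output pixel gets one row holding its KWxKH input patch
    float col[OH*OW][KH*KW];
    for (int oy = 0; oy < OH; oy++)
    for (int ox = 0; ox < OW; ox++)
    for (int ky = 0; ky < KH; ky++)
    for (int kx = 0; kx < KW; kx++) {
        const int iy = oy*s + ky*d - p;
        const int ix = ox*s + kx*d - p;
        col[oy*OW + ox][ky*KW + kx] =
            (iy < 0 || iy >= IH || ix < 0 || ix >= IW) ? 0.0f : x[iy][ix];
    }
    // patch for output pixel (0,0): 0 1 4 5
    printf("%g %g %g %g\n", col[0][0], col[0][1], col[0][2], col[0][3]);
}
```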
@@ -5915,7 +6580,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         return ptr;
     }
 #ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u
+    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
             (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
 #endif
     void * ptr;
@@ -6053,7 +6718,7 @@ void * ggml_cuda_host_malloc(size_t size) {
         // The allocation error can be bypassed. A null ptr will assigned out of this function.
         // This can fixed the OOM error in WSL.
         cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f
+        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
     }
@@ -6098,75 +6763,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
|
|
6098
6763
|
const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
|
6099
6764
|
if (nb0 == ts && nb1 == ts*ne0/bs) {
|
6100
6765
|
return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
|
6101
|
-
}
|
6102
|
-
if (nb0 == ts) {
|
6766
|
+
} else if (nb0 == ts) {
|
6103
6767
|
return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
|
6104
|
-
}
|
6105
|
-
|
6106
|
-
|
6107
|
-
|
6108
|
-
|
6109
|
-
|
6110
|
-
|
6111
|
-
}
|
6112
|
-
return cudaSuccess;
|
6113
|
-
}
|
6114
|
-
|
6115
|
-
static void ggml_cuda_op_repeat(
|
6116
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6117
|
-
const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
|
6118
|
-
// guaranteed to be an integer due to the check in ggml_can_repeat
|
6119
|
-
const int64_t ne0 = dst->ne[0];
|
6120
|
-
const int64_t ne1 = dst->ne[1];
|
6121
|
-
const int64_t ne2 = dst->ne[2];
|
6122
|
-
const int64_t ne3 = dst->ne[3];
|
6123
|
-
|
6124
|
-
const int64_t ne00 = src0->ne[0];
|
6125
|
-
const int64_t ne01 = src0->ne[1];
|
6126
|
-
const int64_t ne02 = src0->ne[2];
|
6127
|
-
const int64_t ne03 = src0->ne[3];
|
6128
|
-
|
6129
|
-
const size_t nb0 = dst->nb[0];
|
6130
|
-
const size_t nb1 = dst->nb[1];
|
6131
|
-
const size_t nb2 = dst->nb[2];
|
6132
|
-
const size_t nb3 = dst->nb[3];
|
6133
|
-
|
6134
|
-
const size_t nb00 = src0->nb[0];
|
6135
|
-
const size_t nb01 = src0->nb[1];
|
6136
|
-
const size_t nb02 = src0->nb[2];
|
6137
|
-
const size_t nb03 = src0->nb[3];
|
6138
|
-
|
6139
|
-
const int nr0 = (int)(ne0/ne00);
|
6140
|
-
const int nr1 = (int)(ne1/ne01);
|
6141
|
-
const int nr2 = (int)(ne2/ne02);
|
6142
|
-
const int nr3 = (int)(ne3/ne03);
|
6143
|
-
|
6144
|
-
// TODO: support for transposed / permuted tensors
|
6145
|
-
GGML_ASSERT(nb0 == sizeof(float));
|
6146
|
-
GGML_ASSERT(nb00 == sizeof(float));
|
6147
|
-
|
6148
|
-
// TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
|
6149
|
-
for (int i3 = 0; i3 < nr3; i3++) {
|
6150
|
-
for (int k3 = 0; k3 < ne03; k3++) {
|
6151
|
-
for (int i2 = 0; i2 < nr2; i2++) {
|
6152
|
-
for (int k2 = 0; k2 < ne02; k2++) {
|
6153
|
-
for (int i1 = 0; i1 < nr1; i1++) {
|
6154
|
-
for (int k1 = 0; k1 < ne01; k1++) {
|
6155
|
-
for (int i0 = 0; i0 < nr0; i0++) {
|
6156
|
-
CUDA_CHECK(cudaMemcpyAsync(
|
6157
|
-
(char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
|
6158
|
-
(const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
|
6159
|
-
ne00*nb0, cudaMemcpyDeviceToDevice, stream));
|
6160
|
-
}
|
6161
|
-
}
|
6162
|
-
}
|
6163
|
-
}
|
6164
|
-
}
|
6768
|
+
} else {
|
6769
|
+
for (int64_t i1 = 0; i1 < i1_diff; i1++) {
|
6770
|
+
const void * rx = (const void *) ((const char *) x + i1*nb1);
|
6771
|
+
void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
|
6772
|
+
// pretend the row is a matrix with cols=1
|
6773
|
+
cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
|
6774
|
+
if (r != cudaSuccess) return r;
|
6165
6775
|
}
|
6776
|
+
return cudaSuccess;
|
6166
6777
|
}
|
6167
|
-
|
6168
|
-
(void) src1;
|
6169
|
-
(void) src1_d;
|
6170
6778
|
}
|
6171
6779
|
|
6172
6780
|
static void ggml_cuda_op_get_rows(
|
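
The restructure of ggml_cuda_cpy_tensor_2d above makes the three copy strategies explicit: fully contiguous data goes through one cudaMemcpyAsync, row-contiguous data through one cudaMemcpy2DAsync, and anything else falls back to a per-row 2D copy that treats each row as a one-column matrix. A standalone sketch of that decision (plain C++, no CUDA calls; the enum is illustrative only):

#include <cstdio>

enum CopyStrategy { FULL_MEMCPY, ROWWISE_2D, ELEMENTWISE_2D };

static CopyStrategy pick_strategy(size_t nb0, size_t nb1, size_t ts, size_t ne0, size_t bs) {
    if (nb0 == ts && nb1 == ts * ne0 / bs) return FULL_MEMCPY; // one big cudaMemcpyAsync
    if (nb0 == ts)                         return ROWWISE_2D;  // one cudaMemcpy2DAsync over all rows
    return ELEMENTWISE_2D; // per-row cudaMemcpy2DAsync with the row treated as a cols=1 matrix
}

int main() {
    // a contiguous f32 row of 128 elements: ts = 4 bytes, bs = 1 element per block
    printf("%d\n", pick_strategy(4, 4 * 128, 4, 128, 1)); // FULL_MEMCPY
    printf("%d\n", pick_strategy(4, 1024,    4, 128, 1)); // ROWWISE_2D (padded rows)
    printf("%d\n", pick_strategy(8, 1024,    4, 128, 1)); // ELEMENTWISE_2D (strided elements)
    return 0;
}
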
@@ -6175,36 +6783,34 @@ static void ggml_cuda_op_get_rows(
 
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(src1));
-    GGML_ASSERT(ggml_is_contiguous(dst));
 
-
-
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
+    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
 
     const int32_t * src1_i32 = (const int32_t *) src1_d;
 
     switch (src0->type) {
         case GGML_TYPE_F16:
-
+            get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
             break;
         case GGML_TYPE_F32:
-
+            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
             break;
         case GGML_TYPE_Q4_0:
-            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(
+            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
             break;
         case GGML_TYPE_Q4_1:
-            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(
+            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
             break;
         case GGML_TYPE_Q5_0:
-            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(
+            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
             break;
         case GGML_TYPE_Q5_1:
-            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(
+            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
             break;
         case GGML_TYPE_Q8_0:
-            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(
+            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
             break;
         default:
             // TODO: k-quants
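
The hunk above relaxes get_rows from requiring fully contiguous tensors to only requiring a contiguous innermost dimension (nb[0] equal to the element size). For reference, the row-gather semantics the kernels implement, as a CPU sketch (not the CUDA code):

#include <cstdint>
#include <cstdio>

int main() {
    const int ncols = 4, nrows = 3;
    float   src0[nrows][ncols] = {{0, 1, 2, 3}, {10, 11, 12, 13}, {20, 21, 22, 23}};
    int32_t rows[2] = {2, 0};   // gather row 2, then row 0
    float   dst[2][ncols];

    // dst[r] = src0[rows[r]], row by row
    for (int r = 0; r < 2; r++)
        for (int c = 0; c < ncols; c++)
            dst[r][c] = src0[rows[r]][c];

    printf("%g %g\n", dst[0][0], dst[1][0]); // 20 0
    return 0;
}
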
@@ -6213,46 +6819,76 @@ static void ggml_cuda_op_get_rows(
     }
 }
 
-
+template<class op>
+inline void ggml_cuda_op_bin_bcast(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-
+        op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-
+        op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-
+        op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream);
     } else {
-        fprintf(stderr, "src0
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
         GGML_ASSERT(false);
     }
+}
+
+static void ggml_cuda_op_repeat(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
 
     (void) src1;
-    (void)
+    (void) src1_d;
 }
 
-inline void
+inline void ggml_cuda_op_add(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+inline void ggml_cuda_op_acc(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
 
-
-
+    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
+    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
+    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+    int offset = dst->op_params[3] / 4; // offset in bytes
 
-
+    acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
 
     (void) dst;
 }
 
+inline void ggml_cuda_op_mul(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+inline void ggml_cuda_op_div(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
 inline void ggml_cuda_op_gelu(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
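
The new ggml_cuda_op_bin_bcast template above funnels the element-wise ops through one broadcast kernel, and the repeat wrapper reuses it by passing dst in the src0 slot and the real src0 in the src1 slot. If the op returns its second operand (which is presumably what op_repeat does; this diff does not show its body), broadcasting over the larger shape reproduces repeat. A 1-D sketch of that idea in plain C++:

#include <cstdio>

// hypothetical stand-in: keep the broadcast operand, ignore the destination
static float op_repeat(float /*a*/, float b) { return b; }

int main() {
    const int ne00 = 3, nr = 2;           // src0 has 3 elements, repeated twice
    float src0[ne00] = {1, 2, 3};
    float dst[ne00 * nr] = {};
    for (int i = 0; i < ne00 * nr; i++)
        dst[i] = op_repeat(dst[i], src0[i % ne00]); // broadcast index wraps around src0
    for (int i = 0; i < ne00 * nr; i++) printf("%g ", dst[i]); // 1 2 3 1 2 3
    printf("\n");
    return 0;
}
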
@@ -6281,6 +6917,34 @@ inline void ggml_cuda_op_silu(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_gelu_quick(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_tanh(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_cuda_op_relu(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6295,38 +6959,38 @@ inline void ggml_cuda_op_relu(
     (void) src1_dd;
 }
 
-inline void
+inline void ggml_cuda_op_leaky_relu(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-
+    float negative_slope;
+    memcpy(&negative_slope, dst->op_params, sizeof(float));
+
+    leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
 
     (void) src1;
     (void) dst;
     (void) src1_dd;
 }
 
-inline void
+inline void ggml_cuda_op_sqr(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-
-    const int64_t nrows = ggml_nrows(src0);
-
-    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
+    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
     (void) src1;
     (void) dst;
     (void) src1_dd;
 }
 
-inline void
+inline void ggml_cuda_op_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
@@ -6339,26 +7003,111 @@ inline void ggml_cuda_op_rms_norm(
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
-
+    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
 
     (void) src1;
     (void) dst;
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_mul_mat_q(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, const cudaStream_t & stream) {
-
-
-    GGML_ASSERT(
-
-
+inline void ggml_cuda_op_group_norm(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int num_groups = dst->op_params[0];
+    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
+    group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_concat(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+        concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
+    }
+
+    (void) src1;
+    (void) dst;
+}
+
+inline void ggml_cuda_op_upscale(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    const int scale_factor = dst->op_params[0];
+
+    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
+
+    (void) src1;
+    (void) dst;
+}
+
+inline void ggml_cuda_op_pad(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    pad_f32_cuda(src0_dd, dst_dd,
+        src0->ne[0], src0->ne[1], src0->ne[2],
+        dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
+
+    (void) src1;
+    (void) dst;
+}
+
+inline void ggml_cuda_op_rms_norm(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_mul_mat_q(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, const cudaStream_t & stream) {
+
+    const int64_t ne00 = src0->ne[0];
+
+    const int64_t ne10 = src1->ne[0];
+    GGML_ASSERT(ne10 % QK8_1 == 0);
+
+    const int64_t ne0 = dst->ne[0];
+
+    const int64_t row_diff = row_high - row_low;
 
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
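
The group-size arithmetic in ggml_cuda_op_group_norm above splits the channel dimension (ne[2]) into num_groups groups, rounding up, with each group covering whole ne[0] * ne[1] planes. A small sketch with illustrative values:

#include <cstdio>

int main() {
    const int ne0 = 8, ne1 = 8, ne2 = 30;  // W, H, C (illustrative)
    const int num_groups = 4;
    const int channels_per_group = (ne2 + num_groups - 1) / num_groups; // ceil(30/4) = 8
    const int group_size = ne0 * ne1 * channels_per_group;              // 8*8*8 = 512
    printf("group_size = %d (covers %d channels)\n", group_size, channels_per_group);
    return 0;
}
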
@@ -6474,6 +7223,8 @@ inline void ggml_cuda_op_mul_mat_vec_q(
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, const cudaStream_t & stream) {
 
+    GGML_ASSERT(ggml_nrows(src1) == 1);
+
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
 
@@ -6533,7 +7284,8 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     size_t ash;
     dfloat * src1_dfloat = nullptr; // dfloat == half
 
-    bool src1_convert_f16 =
+    bool src1_convert_f16 =
+        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
         src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
         src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
 
@@ -6837,7 +7589,6 @@ inline void ggml_cuda_op_im2col(
 
     const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
 
-    const int64_t N = src1->ne[is_2D ? 3 : 2];
     const int64_t IC = src1->ne[is_2D ? 2 : 1];
     const int64_t IH = is_2D ? src1->ne[1] : 1;
     const int64_t IW = src1->ne[0];
@@ -6848,17 +7599,51 @@ inline void ggml_cuda_op_im2col(
     const int64_t OH = is_2D ? dst->ne[2] : 1;
     const int64_t OW = dst->ne[1];
 
-    const size_t
-    const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
 
-    im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
-        OH, IW, IH, OW, IC, KH, KW, N,
-        ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
+    im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
 
     (void) src0;
     (void) src0_dd;
 }
 
+inline void ggml_cuda_op_sum_rows(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_argsort(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+    argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
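
For reference, the semantics behind the new sum_rows op wired up above: each output element is the scalar sum of one input row. A CPU sketch:

#include <cstdio>

int main() {
    const int ncols = 4, nrows = 2;
    float x[nrows][ncols] = {{1, 2, 3, 4}, {10, 20, 30, 40}};
    float dst[nrows];
    // dst[r] = sum over the r-th row of x
    for (int r = 0; r < nrows; r++) {
        dst[r] = 0.0f;
        for (int c = 0; c < ncols; c++) dst[r] += x[r][c];
    }
    printf("%g %g\n", dst[0], dst[1]); // 10 100
    return 0;
}
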
@@ -7067,7 +7852,7 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-
+    const int64_t nrows0 = ggml_nrows(src0);
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
@@ -7103,10 +7888,9 @@ static void ggml_cuda_op_mul_mat(
 
     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
-
     const bool src1_is_contiguous = ggml_is_contiguous(src1);
-
-
+
+    const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
 
     const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
     GGML_ASSERT(!(split && ne02 > 1));
@@ -7231,7 +8015,7 @@ static void ggml_cuda_op_mul_mat(
         const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
 
         // for split tensors the data begins at i0 == i0_offset_low
-        char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
+        char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
         float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
         char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
         float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
@@ -7372,10 +8156,18 @@ static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, gg
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }
 
+static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
+}
+
 static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
 }
 
+static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
+}
+
 static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
 }
@@ -7384,10 +8176,22 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }
 
+static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
+}
+
+static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
+}
+
 static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
 }
 
+static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
+}
+
 static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
 }
@@ -7396,12 +8200,28 @@ static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }
 
+static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
+}
+
+static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
+}
+
+static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
+}
+
+static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
+}
+
 static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded)
+    if (!g_cublas_loaded) return false;
 
     const int64_t ne10 = src1->ne[0];
 
@@ -7479,7 +8299,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
-__global__
+static __global__ void k_compute_batched_ptrs(
     const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
     const void ** ptrs_src, void ** ptrs_dst,
     int ne12, int ne13,
@@ -7535,9 +8355,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
-
-    CUDA_CHECK(cudaGetDevice(&id));
-    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
+    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -7594,7 +8412,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         // use cublasGemmStridedBatchedEx
         CUBLAS_CHECK(
-        cublasGemmStridedBatchedEx(g_cublas_handles[
+        cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
                 &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
                             (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
@@ -7628,7 +8446,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUDA_CHECK(cudaGetLastError());
 
         CUBLAS_CHECK(
-        cublasGemmBatchedEx(g_cublas_handles[
+        cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
                 &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
                             (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
@@ -7698,10 +8516,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 #ifdef GGML_CUDA_FORCE_DMMV
         const bool use_mul_mat_vec_q = false;
 #else
-        const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+        const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
 #endif // GGML_CUDA_FORCE_DMMV
 
         if (use_mul_mat_vec_q) {
+            // NOTE: this kernel does not support ggml_nrows(src1) > 1
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
         } else {
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
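
The use_mul_mat_vec_q change above narrows the quantized vec kernel to true matrix-vector products, matching the new GGML_ASSERT(ggml_nrows(src1) == 1) in ggml_cuda_op_mul_mat_vec_q. The predicate, reduced to a standalone sketch (the 610 value for MIN_CC_DP4A is an assumption, not shown in this diff):

#include <cstdio>

static bool use_mul_mat_vec_q(int min_compute_capability, bool src0_quantized, long src1_nrows) {
    const int MIN_CC_DP4A = 610; // assumed value of this file's constant
    return min_compute_capability >= MIN_CC_DP4A && src0_quantized && src1_nrows == 1;
}

int main() {
    printf("%d\n", use_mul_mat_vec_q(700, true, 1)); // 1: quantized vec path
    printf("%d\n", use_mul_mat_vec_q(700, true, 8)); // 0: falls back to dequantize + dmmv
    return 0;
}
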
@@ -7726,6 +8545,252 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     }
 }
 
+#if 0
+template<typename ... Srcs>
+static __global__ void k_compute_batched_ptrs_id(
+    const void ** ptrs_src, void ** ptrs_dst,
+    int ne12, int ne13,
+    int ne23,
+    int nb02, int nb03,
+    int nb12, int nb13,
+    int nb2, int nb3,
+    int r2, int r3,
+    ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
+    const half * src1_f16, half * dst_f16,
+    const int32_t * ids, const int id,
+    Srcs... src0s) {
+
+    int i = ids[id];
+
+    half * src0_f16;
+    const void * srcs_ar[] = { (const half *) src0s... };
+    if (src0_type == GGML_TYPE_F16) {
+        src0_f16 = (half *) srcs_ar[i];
+    } else {
+        src0_f16 = src0_as_f16;
+        if (threadIdx.x == 0 && threadIdx.y == 0) {
+            const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
+            to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
+        }
+    }
+
+    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i13 >= ne13 || i12 >= ne12) {
+        return;
+    }
+
+    int i03 = i13 / r3;
+    int i02 = i12 / r2;
+
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02 + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (       char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
+}
+
+static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
+    const struct ggml_tensor * ids = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * src00 = dst->src[2];
+
+    const int id = dst->op_params[0];
+
+    GGML_ASSERT(!ggml_is_transposed(src00));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+
+    GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
+    const int64_t ne01 = src00->ne[1];
+    const int64_t ne02 = src00->ne[2];
+    const int64_t ne03 = src00->ne[3];
+
+    //const int64_t nb01 = src00->nb[1];
+    const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
+    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    //const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
+    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+
+    const int64_t ne1 = ggml_nelements(src1);
+    const int64_t ne = ggml_nelements(dst);
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
+
+    //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    //void * src0_ddq = src0_extra->data_device[g_main_device];
+    //half * src0_as_f16 = (half *) src0_ddq;
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    // convert src1 to fp16
+    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+    GGML_ASSERT(to_fp16_cuda != nullptr);
+
+    size_t src1_as = 0;
+    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
+    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
+
+    size_t dst_as = 0;
+    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    const half alpha_f16 = 1.0f;
+    const half beta_f16 = 0.0f;
+
+    // use cublasGemmBatchedEx
+    const int ne23 = ne12*ne13;
+
+    const void ** ptrs_src = nullptr;
+          void ** ptrs_dst = nullptr;
+
+    size_t ptrs_src_s = 0;
+    size_t ptrs_dst_s = 0;
+
+    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+    ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
+
+    int64_t src0_ne = ggml_nelements(src00);
+    half * src0_as_f16 = nullptr;
+    size_t src0_as = 0;
+    if (src00->type != GGML_TYPE_F16) {
+        src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
+    }
+
+    static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
+    dim3 block_dims(ne13, ne12);
+    k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
+            ptrs_src, ptrs_dst,
+            ne12, ne13,
+            ne23,
+            ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
+            nb12, nb13,
+            dst->nb[2], dst->nb[3],
+            r2, r3,
+            src00->type, src0_as_f16, src0_ne,
+            src1_as_f16, dst_f16,
+            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
+            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
+    );
+    CUDA_CHECK(cudaGetLastError());
+
+    CUBLAS_CHECK(
+    cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
+            ne01, ne11, ne10,
+            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
+                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
+            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
+            ne23,
+            CUBLAS_COMPUTE_16F,
+            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    if (src0_as != 0) {
+        ggml_cuda_pool_free(src0_as_f16, src0_as);
+    }
+    if (ptrs_src_s != 0) {
+        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+    }
+    if (ptrs_dst_s != 0) {
+        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+    }
+
+    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+
+    ggml_cuda_pool_free(src1_as_f16, src1_as);
+    ggml_cuda_pool_free(dst_f16, dst_as);
+}
+#endif
+
+static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+#if 0
+    ggml_cuda_mul_mat_id_cublas(dst);
+    // TODO: mmq/mmv support
+#endif
+
+    GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+
+    const struct ggml_tensor * ids = src0;
+    const int32_t id = ((int32_t *) dst->op_params)[0];
+    const int32_t n_as = ((int32_t *) dst->op_params)[1];
+
+    std::vector<char> ids_host(ggml_nbytes(ids));
+
+    if (ids->backend == GGML_BACKEND_GPU) {
+        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
+        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+        CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+    } else {
+        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
+    }
+
+    const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
+    const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
+
+    ggml_tensor_extra_gpu src1_row_extra;
+    ggml_tensor_extra_gpu dst_row_extra;
+
+    ggml_tensor src1_row = *src1;
+    ggml_tensor dst_row = *dst;
+
+    src1_row.ne[1] = 1;
+    dst_row.ne[1] = 1;
+
+    src1_row.nb[2] = src1_row.nb[1];
+    dst_row.nb[2] = dst_row.nb[1];
+
+    src1_row.nb[3] = src1_row.nb[1];
+    dst_row.nb[3] = dst_row.nb[1];
+
+    src1_row.extra = &src1_row_extra;
+    dst_row.extra = &dst_row_extra;
+
+    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+        //int32_t row_id;
+        //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+        //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+        const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+        GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+
+        src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
+        src1_row.data = (char *) src1->data + i01*src1->nb[1];
+
+        dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
+        dst_row.data = (char *) dst->data + i01*dst->nb[1];
+
+        ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+    }
+}
+
 static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
 }
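
The new ggml_cuda_mul_mat_id above copies the ids tensor to the host, then runs an ordinary ggml_cuda_mul_mat per token row against the expert picked by that row's id (dst->src[row_id + 2]). The dispatch pattern, reduced to a toy CPU sketch (the shapes and the per-row product are stand-ins, not the real kernels):

#include <cstdio>

int main() {
    const int n_tokens = 3, n_as = 2, dim = 2;
    float experts[n_as][dim] = {{1, 1}, {2, 2}};     // each "expert" is a dim-vector here
    float src1[n_tokens][dim] = {{1, 2}, {3, 4}, {5, 6}};
    int   ids[n_tokens] = {0, 1, 0};                 // per-token expert index

    float dst[n_tokens];
    for (int t = 0; t < n_tokens; t++) {
        const float * w = experts[ids[t]];           // pick the expert, like dst->src[row_id + 2]
        dst[t] = w[0] * src1[t][0] + w[1] * src1[t][1]; // one "mul_mat" per token row
    }
    for (int t = 0; t < n_tokens; t++) printf("%g ", dst[t]); // 3 14 11
    printf("\n");
    return 0;
}
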
@@ -7770,14 +8835,17 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
 
     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-
+        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
+        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
+        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
+        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -7788,6 +8856,7 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
 }
 
 static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    // TODO: why do we pass dst as src1 here?
     ggml_cuda_cpy(src0, dst, nullptr);
     (void) src1;
 }
@@ -7813,12 +8882,28 @@ static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
 }
 
+static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
+}
+
+static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
+}
+
 static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
     (void) dst;
 }
 
+static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
+}
+
 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     const int64_t nrows = ggml_nrows(tensor);
 
@@ -7868,8 +8953,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 
     // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
     if (ne0 % MATRIX_ROW_PADDING != 0) {
-        size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
-            * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
    }
 
     char * buf;
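
The ggml_row_size refactor above computes the same padding bytes as the old type_size/blck_size expression. A worked sketch with Q4_0-style numbers (18-byte blocks of 32 elements; values illustrative):

#include <cstdio>

int main() {
    const long   ne0       = 4992; // row width in elements (a multiple of the block size)
    const long   PAD       = 512;  // MATRIX_ROW_PADDING
    const size_t type_size = 18;   // bytes per block
    const long   blck_size = 32;   // elements per block

    size_t extra = 0;              // padding added on top of the base tensor size
    if (ne0 % PAD != 0) {
        // ggml_row_size(type, n) == n / blck_size * type_size for block types
        extra = (PAD - ne0 % PAD) / blck_size * type_size;
    }
    printf("padding bytes = %zu\n", extra); // (512 - 384) / 32 * 18 = 72
    return 0;
}
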
@@ -8068,8 +9152,9 @@ void ggml_cuda_set_main_device(const int main_device) {
|
|
8068
9152
|
main_device, g_device_count, g_main_device);
|
8069
9153
|
return;
|
8070
9154
|
}
|
8071
|
-
|
8072
|
-
if (g_device_count > 1) {
|
9155
|
+
|
9156
|
+
if (g_main_device != main_device && g_device_count > 1) {
|
9157
|
+
g_main_device = main_device;
|
8073
9158
|
cudaDeviceProp prop;
|
8074
9159
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
|
8075
9160
|
fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
|
@@ -8095,7 +9180,7 @@ void ggml_cuda_free_scratch() {
|
|
8095
9180
|
}
|
8096
9181
|
|
8097
9182
|
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
8098
|
-
if (!g_cublas_loaded)
|
9183
|
+
if (!g_cublas_loaded) return false;
|
8099
9184
|
|
8100
9185
|
ggml_cuda_func_t func;
|
8101
9186
|
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
@@ -8128,9 +9213,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8128
9213
|
case GGML_OP_ADD:
|
8129
9214
|
func = ggml_cuda_add;
|
8130
9215
|
break;
|
9216
|
+
case GGML_OP_ACC:
|
9217
|
+
func = ggml_cuda_acc;
|
9218
|
+
break;
|
8131
9219
|
case GGML_OP_MUL:
|
8132
9220
|
func = ggml_cuda_mul;
|
8133
9221
|
break;
|
9222
|
+
case GGML_OP_DIV:
|
9223
|
+
func = ggml_cuda_div;
|
9224
|
+
break;
|
8134
9225
|
case GGML_OP_UNARY:
|
8135
9226
|
switch (ggml_get_unary_op(tensor)) {
|
8136
9227
|
case GGML_UNARY_OP_GELU:
|
@@ -8139,15 +9230,37 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8139
9230
|
case GGML_UNARY_OP_SILU:
|
8140
9231
|
func = ggml_cuda_silu;
|
8141
9232
|
break;
|
9233
|
+
case GGML_UNARY_OP_GELU_QUICK:
|
9234
|
+
func = ggml_cuda_gelu_quick;
|
9235
|
+
break;
|
9236
|
+
case GGML_UNARY_OP_TANH:
|
9237
|
+
func = ggml_cuda_tanh;
|
9238
|
+
break;
|
8142
9239
|
case GGML_UNARY_OP_RELU:
|
8143
9240
|
func = ggml_cuda_relu;
|
8144
9241
|
break;
|
8145
9242
|
default:
|
8146
9243
|
return false;
|
8147
|
-
}
|
9244
|
+
}
|
9245
|
+
break;
|
8148
9246
|
case GGML_OP_NORM:
|
8149
9247
|
func = ggml_cuda_norm;
|
8150
9248
|
break;
|
9249
|
+
case GGML_OP_GROUP_NORM:
|
9250
|
+
func = ggml_cuda_group_norm;
|
9251
|
+
break;
|
9252
|
+
case GGML_OP_CONCAT:
|
9253
|
+
func = ggml_cuda_concat;
|
9254
|
+
break;
|
9255
|
+
case GGML_OP_UPSCALE:
|
9256
|
+
func = ggml_cuda_upscale;
|
9257
|
+
break;
|
9258
|
+
case GGML_OP_PAD:
|
9259
|
+
func = ggml_cuda_pad;
|
9260
|
+
break;
|
9261
|
+
case GGML_OP_LEAKY_RELU:
|
9262
|
+
func = ggml_cuda_leaky_relu;
|
9263
|
+
break;
|
8151
9264
|
case GGML_OP_RMS_NORM:
|
8152
9265
|
func = ggml_cuda_rms_norm;
|
8153
9266
|
break;
|
@@ -8157,6 +9270,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8157
9270
|
}
|
8158
9271
|
func = ggml_cuda_mul_mat;
|
8159
9272
|
break;
|
9273
|
+
case GGML_OP_MUL_MAT_ID:
|
9274
|
+
if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
|
9275
|
+
return false;
|
9276
|
+
}
|
9277
|
+
func = ggml_cuda_mul_mat_id;
|
9278
|
+
break;
|
8160
9279
|
case GGML_OP_SCALE:
|
8161
9280
|
func = ggml_cuda_scale;
|
8162
9281
|
break;
|
@@ -8164,9 +9283,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8164
9283
|
func = ggml_cuda_sqr;
|
8165
9284
|
break;
|
8166
9285
|
case GGML_OP_CLAMP:
|
8167
|
-
if (!any_on_device) {
|
8168
|
-
return false;
|
8169
|
-
}
|
8170
9286
|
func = ggml_cuda_clamp;
|
8171
9287
|
break;
|
8172
9288
|
case GGML_OP_CPY:
|
@@ -8175,6 +9291,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8175
9291
|
case GGML_OP_CONT:
|
8176
9292
|
func = ggml_cuda_dup;
|
8177
9293
|
break;
|
9294
|
+
case GGML_OP_NONE:
|
8178
9295
|
case GGML_OP_RESHAPE:
|
8179
9296
|
case GGML_OP_VIEW:
|
8180
9297
|
case GGML_OP_PERMUTE:
|
@@ -8196,6 +9313,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8196
9313
|
case GGML_OP_IM2COL:
|
8197
9314
|
func = ggml_cuda_im2col;
|
8198
9315
|
break;
|
9316
|
+
case GGML_OP_SUM_ROWS:
|
9317
|
+
func = ggml_cuda_sum_rows;
|
9318
|
+
break;
|
9319
|
+
case GGML_OP_ARGSORT:
|
9320
|
+
func = ggml_cuda_argsort;
|
9321
|
+
break;
|
8199
9322
|
default:
|
8200
9323
|
return false;
|
8201
9324
|
}
|
@@ -8212,7 +9335,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8212
9335
|
|
8213
9336
|
int ggml_cuda_get_device_count() {
|
8214
9337
|
int device_count;
|
8215
|
-
|
9338
|
+
if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
|
9339
|
+
return 0;
|
9340
|
+
}
|
8216
9341
|
return device_count;
|
8217
9342
|
}
|
8218
9343
|
|
@@ -8228,27 +9353,16 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
|
|
8228
9353
|
|
8229
9354
|
#define UNUSED GGML_UNUSED
|
8230
9355
|
|
8231
|
-
|
8232
|
-
};
|
8233
|
-
|
8234
|
-
static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
8235
|
-
return GGML_CUDA_NAME;
|
8236
|
-
|
8237
|
-
UNUSED(backend);
|
8238
|
-
}
|
8239
|
-
|
8240
|
-
static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
8241
|
-
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
8242
|
-
delete cuda_ctx;
|
8243
|
-
delete backend;
|
8244
|
-
}
|
9356
|
+
// cuda buffer
|
8245
9357
|
|
8246
9358
|
struct ggml_backend_buffer_context_cuda {
|
8247
|
-
|
8248
|
-
|
9359
|
+
int device;
|
9360
|
+
void * dev_ptr = nullptr;
|
8249
9361
|
ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
|
8250
9362
|
size_t temp_tensor_extra_index = 0;
|
8251
9363
|
|
9364
|
+
ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
|
9365
|
+
|
8252
9366
|
~ggml_backend_buffer_context_cuda() {
|
8253
9367
|
delete[] temp_tensor_extras;
|
8254
9368
|
}
|
@@ -8269,41 +9383,20 @@ struct ggml_backend_buffer_context_cuda {
|
|
8269
9383
|
|
8270
9384
|
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
8271
9385
|
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
8272
|
-
CUDA_CHECK(cudaFree(ctx->
|
9386
|
+
CUDA_CHECK(cudaFree(ctx->dev_ptr));
|
8273
9387
|
delete ctx;
|
8274
9388
|
}
|
8275
9389
|
|
8276
9390
|
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
8277
9391
|
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
8278
|
-
return ctx->
|
8279
|
-
}
|
8280
|
-
|
8281
|
-
static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
8282
|
-
int64_t row_low = 0;
|
8283
|
-
int64_t row_high = ggml_nrows(tensor);
|
8284
|
-
int64_t nrows_split = row_high - row_low;
|
8285
|
-
|
8286
|
-
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
8287
|
-
|
8288
|
-
int64_t ne0 = tensor->ne[0];
|
8289
|
-
|
8290
|
-
if (ggml_is_quantized(tensor->type)) {
|
8291
|
-
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
8292
|
-
size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
|
8293
|
-
* ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
|
8294
|
-
}
|
8295
|
-
}
|
8296
|
-
|
8297
|
-
return size;
|
8298
|
-
|
8299
|
-
UNUSED(buffer);
|
9392
|
+
return ctx->dev_ptr;
|
8300
9393
|
}
|
8301
9394
|
|
8302
9395
|
static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
8303
9396
|
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
8304
9397
|
|
8305
9398
|
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
8306
|
-
assert(tensor->view_src->buffer->
|
9399
|
+
assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
|
8307
9400
|
tensor->backend = tensor->view_src->backend;
|
8308
9401
|
tensor->extra = tensor->view_src->extra;
|
8309
9402
|
return;
|
@@ -8311,7 +9404,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
|
|
8311
9404
|
|
8312
9405
|
ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
|
8313
9406
|
|
8314
|
-
extra->data_device[
|
9407
|
+
extra->data_device[ctx->device] = tensor->data;
|
8315
9408
|
|
8316
9409
|
tensor->backend = GGML_BACKEND_GPU;
|
8317
9410
|
tensor->extra = extra;
|
@@ -8323,64 +9416,207 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
         int64_t nrows_split = row_high - row_low;
 
         size_t original_size = ggml_nbytes_split(tensor, nrows_split);
-        size_t padded_size =
+        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
 
         if (padded_size > original_size && tensor->view_src == nullptr) {
-            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[
+            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
         }
     }
 
     UNUSED(buffer);
 }
 
+static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
+
+    UNUSED(buffer);
+}
+
+static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
+
+    UNUSED(buffer);
+}
+
 static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
-    /* .free_buffer
-    /* .get_base
-    /* .
-    /* .
-    /* .
+    /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .set_tensor      = */ ggml_backend_cuda_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
+    /* .cpy_tensor_from = */ NULL,
+    /* .cpy_tensor_to   = */ NULL,
 };
 
-
-    ggml_cuda_set_device(g_main_device);
+// cuda buffer type
 
-
+static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    int device = (int) (intptr_t) buft->context;
+
+    ggml_cuda_set_device(device);
 
     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
 
-
-    CUDA_CHECK(cudaMalloc(&
+    void * dev_ptr;
+    CUDA_CHECK(cudaMalloc(&dev_ptr, size));
 
-
+    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
+
+    return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
 }
 
-static size_t
+static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 128;
+
+    UNUSED(buft);
+}
+
+static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
+    int64_t row_low = 0;
+    int64_t row_high = ggml_nrows(tensor);
+    int64_t nrows_split = row_high - row_low;
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+    int64_t ne0 = tensor->ne[0];
+
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+    }
+
+    return size;
+
+    UNUSED(buft);
+}
+
+static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    return ggml_backend_is_cuda(backend);
+
+    UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
+    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
+    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
+    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
+    static bool ggml_backend_buffer_type_cuda_initialized = false;
+    if (!ggml_backend_buffer_type_cuda_initialized) {
+        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
+            ggml_backend_buffer_type_cuda[i] = {
+                /* .iface   = */ cuda_backend_buffer_type_interface,
+                /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
+            };
+        }
+        ggml_backend_buffer_type_cuda_initialized = true;
+    }
+
+    return &ggml_backend_buffer_type_cuda[device];
+}
+
+// host buffer type
+
+static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
+    delete ctx;
+}
+
+static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * ptr;
+    CUDA_CHECK(cudaMallocHost(&ptr, size));
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
+
+    return buffer;
+
+    UNUSED(buft);
+}
+
+struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
+    /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+    /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+    /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
+        /* .iface   = */ cuda_backend_host_buffer_type_interface,
+        /* .context = */ nullptr,
+    };
+
+    return &ggml_backend_buffer_type_cuda_host;
+}
+
+// backend
+
+struct ggml_backend_context_cuda {
+    int device;
+};
+
+static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+    return GGML_CUDA_NAME;
+
     UNUSED(backend);
 }
 
+static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    delete cuda_ctx;
+    delete backend;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
+}
+
 static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[
-
-    UNUSED(backend);
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
 }
 
 static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[
-
-    UNUSED(backend);
+    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
 }
 
 static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
-
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
 
     UNUSED(backend);
 }
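The hunk above is the heart of this release's backend rework: device and host allocations now go through per-device buffer types instead of a global g_main_device. A short usage sketch, assuming the ggml_backend_buft_* wrappers declared in ggml-backend.h of this release (sizes and variable names are illustrative only):

    // Device-0 buffer for weights, plus a pinned host buffer for staging.
    ggml_backend_buffer_type_t buft      = ggml_backend_cuda_buffer_type(0);
    ggml_backend_buffer_type_t host_buft = ggml_backend_cuda_host_buffer_type();

    ggml_backend_buffer_t weights = ggml_backend_buft_alloc_buffer(buft,      64u << 20);
    ggml_backend_buffer_t staging = ggml_backend_buft_alloc_buffer(host_buft, 16u << 20);

    // ... allocate tensors from the buffers, upload, compute ...

    ggml_backend_buffer_free(staging);
    ggml_backend_buffer_free(weights);

Pinned (cudaMallocHost) staging memory lets the async set/get_tensor paths above overlap transfers with kernel execution.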
@@ -8394,14 +9630,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
     UNUSED(cgraph);
 }
 
-
+static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
     UNUSED(plan);
 }
 
-
+static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
@@ -8409,7 +9645,9 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
 }
 
 static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    ggml_cuda_set_main_device(cuda_ctx->device);
 
     ggml_compute_params params = {};
     params.type = GGML_TASK_COMPUTE;
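With the device recorded in the backend context, graph evaluation no longer relies on the caller having set a global main device: graph_compute switches to its own device first. A sketch of driving it through the public wrappers from ggml-backend.h (gf stands in for a graph built elsewhere against this backend's buffers):

    ggml_backend_t backend = ggml_backend_cuda_init(1); // second GPU
    // ggml_cgraph * gf = ...;
    ggml_backend_graph_compute(backend, gf);
    ggml_backend_synchronize(backend);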
@@ -8417,13 +9655,18 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
             continue;
-
+
         assert(node->backend == GGML_BACKEND_GPU);
+        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+        assert(node->extra != nullptr);
+
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
                 assert(node->src[j]->backend == GGML_BACKEND_GPU);
+                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                assert(node->src[j]->extra != nullptr);
             }
         }
 
@@ -8460,27 +9703,143 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     UNUSED(backend);
 }
 
+static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_GELU_QUICK:
+                case GGML_UNARY_OP_TANH:
+                    return true;
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
+            {
+                struct ggml_tensor * a;
+                struct ggml_tensor * b;
+                if (op->op == GGML_OP_MUL_MAT) {
+                    a = op->src[0];
+                    b = op->src[1];
+                } else {
+                    a = op->src[2];
+                    b = op->src[1];
+                }
+                if (a->ne[3] != b->ne[3]) {
+                    return false;
+                }
+                return true;
+            } break;
+        case GGML_OP_GET_ROWS:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                        return true;
+                    default:
+                        return false;
+                }
+            } break;
+        case GGML_OP_CPY:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                ggml_type src1_type = op->src[1]->type;
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                return false;
+            } break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_NORM:
+        case GGML_OP_REPEAT:
+        case GGML_OP_DUP:
+        case GGML_OP_ADD:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_SCALE:
+        case GGML_OP_SQR:
+        case GGML_OP_CLAMP:
+        case GGML_OP_CONT:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_ROPE:
+        case GGML_OP_ALIBI:
+        case GGML_OP_IM2COL:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_ACC:
+        case GGML_OP_CONCAT:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_UPSCALE:
+        case GGML_OP_PAD:
+        case GGML_OP_LEAKY_RELU:
+            return true;
+        default:
+            return false;
+    }
+
+    UNUSED(backend);
+}
+
 static ggml_backend_i cuda_backend_i = {
-    /* .get_name
-    /* .free
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .supports_op = */ nullptr,
+    /* .get_name                = */ ggml_backend_cuda_name,
+    /* .free                    = */ ggml_backend_cuda_free,
+    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
+    /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
+    /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
+    /* .cpy_tensor_from_async   = */ NULL,
+    /* .cpy_tensor_to_async     = */ NULL,
+    /* .synchronize             = */ ggml_backend_cuda_synchronize,
+    /* .graph_plan_create       = */ ggml_backend_cuda_graph_plan_create,
+    /* .graph_plan_free         = */ ggml_backend_cuda_graph_plan_free,
+    /* .graph_plan_compute      = */ ggml_backend_cuda_graph_plan_compute,
+    /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
+    /* .supports_op             = */ ggml_backend_cuda_supports_op,
 };
 
-ggml_backend_t ggml_backend_cuda_init() {
+ggml_backend_t ggml_backend_cuda_init(int device) {
     ggml_init_cublas(); // TODO: remove from ggml.c
 
-
+    if (device < 0 || device >= ggml_cuda_get_device_count()) {
+        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+        return nullptr;
+    }
+
+    // not strictly necessary, but it may reduce the overhead of the first graph_compute
+    ggml_cuda_set_main_device(device);
+
+    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
+        /* .device = */ device
+    };
 
     ggml_backend_t cuda_backend = new ggml_backend {
         /* .interface = */ cuda_backend_i,
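supports_op, previously a nullptr slot, now lets a scheduler ask per node whether this backend can run an op before committing it. A sketch using the public wrapper (we assume the ggml_backend_supports_op wrapper from ggml-backend.h of this release; cuda, cpu, and node are placeholders for previously created backends and a graph node):

    // Fall back to the CPU backend for ops the CUDA backend rejects.
    ggml_backend_t target = ggml_backend_supports_op(cuda, node) ? cuda : cpu;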
@@ -8489,3 +9848,27 @@ ggml_backend_t ggml_backend_cuda_init() {
 
     return cuda_backend;
 }
+
+bool ggml_backend_is_cuda(ggml_backend_t backend) {
+    return backend->iface.get_name == ggml_backend_cuda_name;
+}
+
+static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
+    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
+    return cuda_backend;
+
+    UNUSED(params);
+}
+
+extern "C" int ggml_backend_cuda_reg_devices();
+
+int ggml_backend_cuda_reg_devices() {
+    int device_count = ggml_cuda_get_device_count();
+    //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
+    for (int i = 0; i < device_count; i++) {
+        char name[128];
+        snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
+        ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
+    }
+    return device_count;
+}
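The registry entries make every visible GPU selectable by name ("CUDA0", "CUDA1", ...), each paired with its own buffer type. A closing sketch of per-device initialization through the entry points added in this hunk:

    int n_devices = ggml_backend_cuda_reg_devices(); // registers one backend per GPU

    ggml_backend_t backend = ggml_backend_cuda_init(0);
    if (backend == nullptr) {
        // invalid device index: init now fails cleanly instead of asserting
    }
    GGML_ASSERT(ggml_backend_is_cuda(backend));
    ggml_backend_free(backend);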