llama_cpp 0.9.5 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/llama_cpp.cpp +123 -15
- data/ext/llama_cpp/src/ggml-alloc.c +42 -7
- data/ext/llama_cpp/src/ggml-alloc.h +8 -1
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1796 -413
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +998 -169
- data/ext/llama_cpp/src/ggml-metal.metal +2253 -274
- data/ext/llama_cpp/src/ggml-quants.c +2 -2
- data/ext/llama_cpp/src/ggml.c +634 -248
- data/ext/llama_cpp/src/ggml.h +81 -15
- data/ext/llama_cpp/src/llama.cpp +932 -352
- data/ext/llama_cpp/src/llama.h +28 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +22 -2
- metadata +2 -2
```diff
@@ -1,12 +1,15 @@
 #include <algorithm>
+#include <assert.h>
+#include <atomic>
 #include <cinttypes>
 #include <cstddef>
 #include <cstdint>
+#include <float.h>
 #include <limits>
 #include <stdint.h>
 #include <stdio.h>
-#include <
-
+#include <vector>
+

 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
```
```diff
@@ -69,6 +72,7 @@
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamFireAndForget hipStreamFireAndForget
 #define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
@@ -190,7 +194,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
         fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
             cudaGetErrorString(err_)); \
         fprintf(stderr, "current device: %d\n", id); \
-
+        GGML_ASSERT(!"CUDA error"); \
     } \
 } while (0)

@@ -204,7 +208,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
         fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
             err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
         fprintf(stderr, "current device: %d\n", id); \
-
+        GGML_ASSERT(!"cuBLAS error"); \
     } \
 } while (0)
 #else
@@ -216,7 +220,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
         cudaGetDevice(&id); \
         fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
         fprintf(stderr, "current device: %d\n", id); \
-
+        GGML_ASSERT(!"cuBLAS error"); \
     } \
 } while (0)
 #endif // CUDART_VERSION >= 11
@@ -433,10 +437,9 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define WARP_SIZE 32
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

-#define CUDA_ADD_BLOCK_SIZE 256
-#define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_TANH_BLOCK_SIZE 256
 #define CUDA_RELU_BLOCK_SIZE 256
 #define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
@@ -449,6 +452,11 @@
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 #define CUDA_GET_ROWS_BLOCK_SIZE 256
+#define CUDA_UPSCALE_BLOCK_SIZE 256
+#define CUDA_CONCAT_BLOCK_SIZE 256
+#define CUDA_PAD_BLOCK_SIZE 256
+#define CUDA_ACC_BLOCK_SIZE 256
+#define CUDA_IM2COL_BLOCK_SIZE 256

 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
```
```diff
@@ -527,40 +535,105 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
     return x;
 }

-static
-
+static __device__ __forceinline__ float op_repeat(const float a, const float b) {
+    return b;
+}

-
-
-}
-    dst[i] = x[i] + y[i%ky];
+static __device__ __forceinline__ float op_add(const float a, const float b) {
+    return a + b;
 }

-static
-
+static __device__ __forceinline__ float op_mul(const float a, const float b) {
+    return a * b;
+}

-
+static __device__ __forceinline__ float op_div(const float a, const float b) {
+    return a / b;
+}
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1, int s2, int s3,
+        /*int s10,*/ int s11, int s12, int s13) {
+    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
+    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
+    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
+
+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
         return;
     }
-
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
+        const int i10 = i0 % ne10;
+        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+    }
 }

-
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1, int s2, int s3,
+        /*int s10,*/ int s11, int s12, int s13) {
+
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

-
+    const int i3 = i/(ne2*ne1*ne0);
+    const int i2 = (i/(ne1*ne0)) % ne2;
+    const int i1 = (i/ne0) % ne1;
+    const int i0 = i % ne0;
+
+    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
         return;
     }
-    dst[i] = __half2float(x[i]) + y[i];
-}

-
-    const int
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    const int i10 = i0 % ne10;
+    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+}

-
+static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
+        const int ne10, const int ne11, const int ne12,
+        const int nb1, const int nb2, int offset) {
+    const int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= ne) {
         return;
     }
-
+    int src1_idx = i - offset;
+    int oz = src1_idx / nb2;
+    int oy = (src1_idx - (oz * nb2)) / nb1;
+    int ox = src1_idx % nb1;
+    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
+        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
+    } else {
+        dst[i] = x[i];
+    }
 }

 static __global__ void gelu_f32(const float * x, float * dst, const int k) {
```
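The added `k_bin_bcast`/`k_bin_bcast_unravel` kernels replace the old fixed-shape `add`/`mul` kernels with a single templated broadcast kernel. The snippet below is a CPU-side illustration of the same modulo-based broadcast indexing; it is not part of the diff, and the shapes and the `op_add` choice are invented for the example.

```cpp
// CPU sketch of the broadcast indexing used by k_bin_bcast: src1 dimensions of
// size 1 repeat across the corresponding dst dimension via the modulo indices.
#include <cstdio>
#include <vector>

int main() {
    const int ne0 = 4, ne1 = 2, ne2 = 1, ne3 = 1;     // dst / src0 shape
    const int ne10 = 4, ne11 = 1, ne12 = 1, ne13 = 1; // src1 shape (broadcast over dim 1)
    std::vector<float> src0(ne0*ne1, 1.0f), src1(ne10*ne11), dst(ne0*ne1);
    for (int i = 0; i < ne10; ++i) src1[i] = (float) i;

    for (int i3 = 0; i3 < ne3; ++i3)
    for (int i2 = 0; i2 < ne2; ++i2)
    for (int i1 = 0; i1 < ne1; ++i1)
    for (int i0 = 0; i0 < ne0; ++i0) {
        const int i11 = i1 % ne11, i12 = i2 % ne12, i13 = i3 % ne13, i10 = i0 % ne10;
        (void) i12; (void) i13; // higher dims are trivial in this tiny example
        dst[i1*ne0 + i0] = src0[i1*ne0 + i0] + src1[i11*ne10 + i10]; // op_add
    }
    printf("dst[1][3] = %g\n", dst[1*ne0 + 3]); // 1 + 3 = 4
    return 0;
}
```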
```diff
@@ -585,6 +658,23 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }

+static __global__ void gelu_quick_f32(const float *x, float *dst, int k) {
+    const float GELU_QUICK_COEF = -1.702f;
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
+}
+
+static __global__ void tanh_f32(const float *x, float *dst, int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+    if (i >= k) {
+        return;
+    }
+    dst[i] = tanhf(x[i]);
+}
+
 static __global__ void relu_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -594,6 +684,14 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
     dst[i] = fmaxf(x[i], 0);
 }

+static __global__ void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+    if (i >= k) {
+        return;
+    }
+    dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
+}
+
 static __global__ void sqr_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -604,12 +702,10 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
 }

 template <int block_size>
-static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;

-    const float eps = 1e-5f;
-
     float2 mean_var = make_float2(0.f, 0.f);

     for (int col = tid; col < ncols; col += block_size) {
```
```diff
@@ -641,6 +737,132 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     }
 }

+static __global__ void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02) {
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    // operation
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    if (blockIdx.z < ne02) { // src0
+        int offset_src =
+            nidx +
+            blockIdx.y * ne0 +
+            blockIdx.z * ne0 * gridDim.y;
+        dst[offset_dst] = x[offset_src];
+    } else {
+        int offset_src =
+            nidx +
+            blockIdx.y * ne0 +
+            (blockIdx.z - ne02) * ne0 * gridDim.y;
+        dst[offset_dst] = y[offset_src];
+    }
+}
+
+static __global__ void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor) {
+    int ne0 = ne00 * scale_factor;
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    // operation
+    int i00 = nidx / scale_factor;
+    int i01 = blockIdx.y / scale_factor;
+    int offset_src =
+        i00 +
+        i01 * ne00 +
+        blockIdx.z * nb02;
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    dst[offset_dst] = x[offset_src];
+}
+
+static __global__ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02) {
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+
+    // operation
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02) {
+        int offset_src =
+            nidx +
+            blockIdx.y * ne00 +
+            blockIdx.z * ne00 * ne01;
+        dst[offset_dst] = x[offset_src];
+    } else {
+        dst[offset_dst] = 0.0f;
+    }
+}
+
+template <int block_size>
+static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
+    int start = blockIdx.x * group_size;
+    int end = start + group_size;
+
+    start += threadIdx.x;
+
+    if (end >= ne_elements) {
+        end = ne_elements;
+    }
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int j = start; j < end; j += block_size) {
+        tmp += x[j];
+    }
+
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }
+
+    float mean = tmp / group_size;
+    tmp = 0.0f;
+
+    for (int j = start; j < end; j += block_size) {
+        float xi = x[j] - mean;
+        dst[j] = xi;
+        tmp += xi * xi;
+    }
+
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }
+
+    float variance = tmp / group_size;
+    float scale = rsqrtf(variance + eps);
+    for (int j = start; j < end; j += block_size) {
+        dst[j] *= scale;
+    }
+}
+
 template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
```
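For reference, the per-group normalization that `group_norm_f32` computes (mean, variance, then scaling by `rsqrt(variance + eps)`) can be written on the CPU as follows. This is an illustrative sketch, not code from the gem.

```cpp
// CPU sketch of the group_norm_f32 math: normalize each contiguous group of
// `group_size` elements to zero mean and unit variance (same eps convention).
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static void group_norm_ref(const float * x, float * dst, int group_size, int n, float eps) {
    for (int start = 0; start < n; start += group_size) {
        const int end = std::min(start + group_size, n);
        float mean = 0.0f;
        for (int j = start; j < end; ++j) mean += x[j];
        mean /= group_size;
        float var = 0.0f;
        for (int j = start; j < end; ++j) { const float xi = x[j] - mean; dst[j] = xi; var += xi*xi; }
        var /= group_size;
        const float scale = 1.0f / std::sqrt(var + eps);
        for (int j = start; j < end; ++j) dst[j] *= scale;
    }
}

int main() {
    std::vector<float> x = {1, 2, 3, 4, 10, 20, 30, 40}, y(x.size());
    group_norm_ref(x.data(), y.data(), 4, (int) x.size(), 1e-6f);
    printf("%g %g\n", y[0], y[4]); // both groups normalize to the same pattern
    return 0;
}
```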
```diff
@@ -1639,31 +1861,65 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
 }

 template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void k_get_rows(
-
-
-
-
+static __global__ void k_get_rows(
+        const void * src0, const int32_t * src1, dst_t * dst,
+        int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+        /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+        /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+        /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+        size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
+
+    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
+    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
+    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+
+    if (i00 >= ne00) {
         return;
     }

-    const int
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];

-
-    const
-    const int di = row*ncols + col;
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;

-    const int ib =
-    const int iqs = (
-    const int iybs =
+    const int ib = i00/qk; // block index
+    const int iqs = (i00%qk)/qr; // quant index
+    const int iybs = i00 - i00%qk; // dst block start index
     const int y_offset = qr == 1 ? 1 : qk/2;

     // dequantize
     dfloat2 v;
-    dequantize_kernel(
+    dequantize_kernel(src0_row, ib, iqs, v);
+
+    dst_row[iybs + iqs + 0]        = v.x;
+    dst_row[iybs + iqs + y_offset] = v.y;
+}
+
+template<typename src0_t, typename dst_t>
+static __global__ void k_get_rows_float(
+        const src0_t * src0, const int32_t * src1, dst_t * dst,
+        int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+        /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+        /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+        /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+        size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
+
+    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
+    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
+    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;

-
-
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
+
+    dst_row[i00] = src0_row[i00];
 }

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
```
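The reworked `k_get_rows`/`k_get_rows_float` kernels gather whole rows of `src0` selected by the `int32` indices in `src1`. A minimal CPU sketch of that gather, assuming a contiguous row-major layout (names and sizes are invented for the example):

```cpp
// CPU sketch of the get_rows gather: dst row r = src0 row src1[r].
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int ne00 = 3;                               // row length
    std::vector<float> src0 = {0,0,0, 1,1,1, 2,2,2};  // 3 source rows
    std::vector<int32_t> rows = {2, 0, 2};            // src1: which rows to gather
    std::vector<float> dst(rows.size()*ne00);
    for (size_t i10 = 0; i10 < rows.size(); ++i10) {
        for (int i00 = 0; i00 < ne00; ++i00) {
            dst[i10*ne00 + i00] = src0[rows[i10]*ne00 + i00];
        }
    }
    printf("dst row 0 starts with %g\n", dst[0]); // 2
    return 0;
}
```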
```diff
@@ -4559,6 +4815,116 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
     cpy_1(cx + x_offset, cdst + dst_offset);
 }

+static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q8_0 * dsti = (block_q8_0 *) cdsti;
+
+    float amax = 0.0f; // absolute max
+
+    for (int j = 0; j < QK8_0; j++) {
+        const float v = xi[j];
+        amax = fmaxf(amax, fabsf(v));
+    }
+
+    const float d = amax / ((1 << 7) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK8_0; ++j) {
+        const float x0 = xi[j]*id;
+
+        dsti->qs[j] = roundf(x0);
+    }
+}
+
+static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_0 * dsti = (block_q4_0 *) cdsti;
+
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_0; ++j) {
+        const float v = xi[j];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            vmax = v;
+        }
+    }
+
+    const float d  = vmax / -8;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK4_0/2; ++j) {
+        const float x0 = xi[0 + j]*id;
+        const float x1 = xi[QK4_0/2 + j]*id;
+
+        const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
+        const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
+
+        dsti->qs[j]  = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_1 * dsti = (block_q4_1 *) cdsti;
+
+    float vmin = FLT_MAX;
+    float vmax = -FLT_MAX;
+
+    for (int j = 0; j < QK4_1; ++j) {
+        const float v = xi[j];
+
+        if (v < vmin) vmin = v;
+        if (v > vmax) vmax = v;
+    }
+
+    const float d  = (vmax - vmin) / ((1 << 4) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->dm.x = d;
+    dsti->dm.y = vmin;
+
+    for (int j = 0; j < QK4_1/2; ++j) {
+        const float x0 = (xi[0 + j] - vmin)*id;
+        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
+
+        const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
+        const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
+
+        dsti->qs[j]  = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+template <cpy_kernel_t cpy_blck, int qk>
+static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
+        const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+        const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
+    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int i02 = i / (ne00*ne01);
+    const int i01 = (i - i02*ne01*ne00) / ne00;
+    const int i00 = (i - i02*ne01*ne00 - i01*ne00);
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+    const int i12 = i / (ne10*ne11);
+    const int i11 = (i - i12*ne10*ne11) / ne10;
+    const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+    cpy_blck(cx + x_offset, cdst + dst_offset);
+}
+
 static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
     const float y = (i0 / 2 - low) / max(0.001f, high - low);
     return 1.0f - min(1.0f, max(0.0f, y));
```
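The new `cpy_blck_f32_q8_0` path quantizes each 32-float block into one scale plus 32 `int8` values. The standalone sketch below mirrors that math on the CPU; `BlockQ8_0` is a simplified stand-in for illustration, not the actual `block_q8_0` struct (which stores the scale as fp16).

```cpp
// CPU sketch of Q8_0 block quantization: d = amax/127 per block, values stored
// as round(x/d) in int8; dequantization is d * qs[j].
#include <cmath>
#include <cstdint>
#include <cstdio>

constexpr int QK8_0 = 32;
struct BlockQ8_0 { float d; int8_t qs[QK8_0]; };

static BlockQ8_0 quantize_block(const float * x) {
    float amax = 0.0f;
    for (int j = 0; j < QK8_0; ++j) amax = std::fmax(amax, std::fabs(x[j]));
    BlockQ8_0 b{};
    b.d = amax / 127.0f;
    const float id = b.d ? 1.0f/b.d : 0.0f;
    for (int j = 0; j < QK8_0; ++j) b.qs[j] = (int8_t) std::roundf(x[j]*id);
    return b;
}

int main() {
    float x[QK8_0];
    for (int j = 0; j < QK8_0; ++j) x[j] = 0.1f*j - 1.0f;
    const BlockQ8_0 b = quantize_block(x);
    printf("x[5]=%g  dequantized=%g\n", x[5], b.d*b.qs[5]);
    return 0;
}
```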
```diff
@@ -4713,6 +5079,65 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols,
     dst[i] = col * m_k + x[i];
 }

+static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.y;
+    const int col = threadIdx.x;
+
+    float sum = 0.0f;
+    for (int i = col; i < ncols; i += blockDim.x) {
+        sum += x[row * ncols + i];
+    }
+
+    sum = warp_reduce_sum(sum);
+
+    if (col == 0) {
+        dst[row] = sum;
+    }
+}
+
+template<typename T>
+static inline __device__ void swap(T & a, T & b) {
+    T tmp = a;
+    a = b;
+    b = tmp;
+}
+
+template<ggml_sort_order order>
+static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
+    // bitonic sort
+    int col = threadIdx.x;
+    int row = blockIdx.y;
+
+    if (col >= ncols) return;
+
+    const float * x_row = x + row * ncols;
+    int * dst_row = dst + row * ncols;
+
+    // initialize indices
+    if (col < ncols) {
+        dst_row[col] = col;
+    }
+    __syncthreads();
+
+    for (int k = 2; k <= ncols; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
+                        swap(dst_row[col], dst_row[ixj]);
+                    }
+                } else {
+                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
+                        swap(dst_row[col], dst_row[ixj]);
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+}
+
 static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
     const int col = blockDim.y*blockIdx.y + threadIdx.y;
     const int row = blockDim.x*blockIdx.x + threadIdx.x;
@@ -4722,8 +5147,9 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
     }

     const int i = row*ncols + col;
-    //
-    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
+    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
 }

 static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
```
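`k_argsort_f32_i32` sorts indices per row with an in-block bitonic sort, which is why the launcher added later in this diff asserts that `ncols` is a power of two. The result it produces is the same as an ordinary index sort, sketched here on the CPU:

```cpp
// CPU sketch of the argsort result: indices that order a row's values (ascending).
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    std::vector<float> row = {0.3f, -1.0f, 2.5f, 0.0f};
    std::vector<int> idx(row.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::sort(idx.begin(), idx.end(), [&](int a, int b) { return row[a] < row[b]; });
    for (int i : idx) printf("%d ", i); // 1 3 0 2
    printf("\n");
    return 0;
}
```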
```diff
@@ -4820,49 +5246,220 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,

 static __global__ void im2col_f32_f16(
         const float * x, half * dst,
-        int
+        int offset_delta, int IW, int IH, int OW, int KW, int KH, int pelements, int CHW,
         int s0, int s1, int p0, int p1, int d0, int d1) {
-    const int
-
+    const int i = threadIdx.x + blockIdx.x * blockDim.x;
+    if (i >= pelements) {
+        return;
+    }
+
+    const int ksize = OW * (KH > 1 ? KW : 1);
+    const int kx = i / ksize;
+    const int kd = kx * ksize;
+    const int ky = (i - kd) / OW;
+    const int ix = i % OW;
+
+    const int iiw = ix * s0 + kx * d0 - p0;
+    const int iih = blockIdx.y * s1 + ky * d1 - p1;

     const int offset_dst =
-        (
-        (blockIdx.
+        (blockIdx.y * OW + ix) * CHW +
+        (blockIdx.z * (KW * KH) + ky * KW + kx);

     if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
         dst[offset_dst] = __float2half(0.0f);
     } else {
-        const int offset_src =
+        const int offset_src = blockIdx.z * offset_delta;
         dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
     }
 }

 template<int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_cuda(const
+static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+        const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (
-    const dim3 block_nums(block_num_x,
-
-
+    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    GGML_ASSERT(ne00 % 2 == 0);
+
+    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
+        src0_dd, src1_dd, dst_dd,
+        ne00, /*ne01, ne02, ne03,*/
+        /*ne10, ne11,*/ ne12, /*ne13,*/
+        /* s0,*/ s1, s2, s3,
+        /* nb00,*/ nb01, nb02, nb03,
+        s10, s11, s12/*, s13*/);

-
-    const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
-    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
+    (void) dst;
 }

-
-
-
-
+template<typename src0_t>
+static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+        const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS

-
-    const int
-
+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
+    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
+        src0_dd, src1_dd, dst_dd,
+        ne00, /*ne01, ne02, ne03,*/
+        /*ne10, ne11,*/ ne12, /*ne13,*/
+        /* s0,*/ s1, s2, s3,
+        /* nb00,*/ nb01, nb02, nb03,
+        s10, s11, s12/*, s13*/);
+
+    (void) dst;
 }

-
-
-
+template<float (*bin_op)(const float, const float)>
+struct bin_bcast_cuda {
+    template<typename src0_t, typename src1_t, typename dst_t>
+    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
+            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
+            cudaStream_t stream) {
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        int nr0 = ne10/ne0;
+        int nr1 = ne11/ne1;
+        int nr2 = ne12/ne2;
+        int nr3 = ne13/ne3;
+
+        int nr[4] = { nr0, nr1, nr2, nr3 };
+
+        // collapse dimensions until first broadcast dimension
+        int64_t cne0[] = {ne0, ne1, ne2, ne3};
+        int64_t cne1[] = {ne10, ne11, ne12, ne13};
+        size_t cnb0[] = {nb0, nb1, nb2, nb3};
+        size_t cnb1[] = {nb10, nb11, nb12, nb13};
+        auto collapse = [](int64_t cne[]) {
+            cne[0] *= cne[1];
+            cne[1] = cne[2];
+            cne[2] = cne[3];
+            cne[3] = 1;
+        };
+
+        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
+            cnb[1] *= cne[1];
+            cnb[2] *= cne[2];
+            cnb[3] *= cne[3];
+        };
+
+        for (int i = 0; i < 4; i++) {
+            if (nr[i] != 1) {
+                break;
+            }
+            if (i > 0) {
+                collapse_nb(cnb0, cne0);
+                collapse_nb(cnb1, cne1);
+                collapse(cne0);
+                collapse(cne1);
+            }
+        }
+        {
+            int64_t ne0 = cne0[0];
+            int64_t ne1 = cne0[1];
+            int64_t ne2 = cne0[2];
+            int64_t ne3 = cne0[3];
+
+            int64_t ne10 = cne1[0];
+            int64_t ne11 = cne1[1];
+            int64_t ne12 = cne1[2];
+            int64_t ne13 = cne1[3];
+
+            size_t nb0 = cnb0[0];
+            size_t nb1 = cnb0[1];
+            size_t nb2 = cnb0[2];
+            size_t nb3 = cnb0[3];
+
+            size_t nb10 = cnb1[0];
+            size_t nb11 = cnb1[1];
+            size_t nb12 = cnb1[2];
+            size_t nb13 = cnb1[3];
+
+            size_t s0 = nb0 / sizeof(dst_t);
+            size_t s1 = nb1 / sizeof(dst_t);
+            size_t s2 = nb2 / sizeof(dst_t);
+            size_t s3 = nb3 / sizeof(dst_t);
+
+            size_t s10 = nb10 / sizeof(src1_t);
+            size_t s11 = nb11 / sizeof(src1_t);
+            size_t s12 = nb12 / sizeof(src1_t);
+            size_t s13 = nb13 / sizeof(src1_t);
+
+            GGML_ASSERT(s0 == 1);
+            GGML_ASSERT(s10 == 1);
+
+            const int block_size = 128;
+
+            int64_t hne0 = std::max(ne0/2LL, 1LL);
+
+            dim3 block_dims;
+            block_dims.x = std::min<unsigned int>(hne0, block_size);
+            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
+            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
+
+            dim3 block_nums(
+                (hne0 + block_dims.x - 1) / block_dims.x,
+                (ne1 + block_dims.y - 1) / block_dims.y,
+                (ne2*ne3 + block_dims.z - 1) / block_dims.z
+            );
+
+            if (block_nums.z > 65535) {
+                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
+                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
+                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd,
+                    ne0, ne1, ne2, ne3,
+                    ne10, ne11, ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s10, */ s11, s12, s13);
+            } else {
+                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd,
+                    ne0, ne1, ne2, ne3,
+                    ne10, ne11, ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s10, */ s11, s12, s13);
+            }
+        }
+    }
+};
+
+static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
+        const int ne10, const int ne11, const int ne12,
+        const int nb1, const int nb2, const int offset, cudaStream_t stream) {
+    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
 }

 static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
```
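Before launching, `bin_bcast_cuda` collapses leading non-broadcast dimensions so the kernel sees fewer, larger dimensions. A CPU sketch of that collapse step follows; the shapes are made up for the example and the broadcast test is simplified to a shape comparison.

```cpp
// CPU sketch of "collapse dimensions until first broadcast dimension":
// leading dims where src1 matches dst are merged into dim 0.
#include <cstdint>
#include <cstdio>
#include <initializer_list>

int main() {
    int64_t cne0[4] = {32, 8, 4, 2};   // dst shape
    int64_t cne1[4] = {32, 8, 1, 2};   // src1 shape (broadcast over dim 2)
    bool bcast[4];
    for (int i = 0; i < 4; ++i) bcast[i] = cne1[i] != cne0[i];
    for (int i = 0; i < 4; ++i) {
        if (bcast[i]) break;           // stop at the first broadcast dimension
        if (i > 0) {
            for (int64_t * cne : {cne0, cne1}) {
                cne[0] *= cne[1]; cne[1] = cne[2]; cne[2] = cne[3]; cne[3] = 1;
            }
        }
    }
    printf("collapsed dst: %lld x %lld x %lld x %lld\n",   // 256 x 4 x 2 x 1
           (long long) cne0[0], (long long) cne0[1], (long long) cne0[2], (long long) cne0[3]);
    return 0;
}
```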
```diff
@@ -4875,27 +5472,74 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }

+static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
+    tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
     relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }

+static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
+}
+
 static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
     sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }

-static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
         const dim3 block_dims(WARP_SIZE, 1, 1);
-        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    }
+}
+
+static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
+    static const float eps = 1e-6f;
+    if (group_size < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
     } else {
         const dim3 block_dims(1024, 1, 1);
-
+        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
     }
 }

+static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2);
+    concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
+}
+
+static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int scale_factor, cudaStream_t stream) {
+    int ne0 = (ne00 * scale_factor);
+    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02);
+    upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
+}
+
+static void pad_f32_cuda(const float * x, float * dst,
+        const int ne00, const int ne01, const int ne02,
+        const int ne0, const int ne1, const int ne2, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2);
+    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02);
+}
+
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
@@ -4914,34 +5558,10 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
     quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
 }

-template<typename dst_t>
-static void
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-template<typename dst_t>
-static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-template<typename dst_t>
-static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-template<typename dst_t>
-static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-template<typename dst_t>
-static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<
+    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

 template<typename dst_t>
```
```diff
@@ -4990,6 +5610,64 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
 #endif
 }

+static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_cuda;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_cuda;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_cuda;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_cuda;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_cuda;
+        case GGML_TYPE_F32:
+            return dequantize_block_cuda<1, 1, convert_f32>;
+        default:
+            return nullptr;
+    }
+}
+
+static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_cuda;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_cuda;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_cuda;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_cuda;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_cuda;
+        case GGML_TYPE_F16:
+            return dequantize_block_cuda<1, 1, convert_f16>;
+        default:
+            return nullptr;
+    }
+}
+
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
```
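`ggml_get_to_fp16_cuda`/`ggml_get_to_fp32_cuda` now return instantiations of the templated `dequantize_block_cuda` instead of one hand-written wrapper per type. The dispatch pattern, reduced to a minimal standalone sketch (the enum and converter functions are invented placeholders, not ggml symbols):

```cpp
// Minimal sketch of type-to-converter dispatch: map a type enum to a function
// pointer and return nullptr for unsupported types.
#include <cstdio>

enum class Ty { F16, Q8_0, Unknown };
using to_f32_fn = void (*)(const void * src, float * dst, int n);

static void conv_f16 (const void *, float *, int) { /* convert fp16 -> fp32 */ }
static void conv_q8_0(const void *, float *, int) { /* dequantize Q8_0 -> fp32 */ }

static to_f32_fn get_to_f32(Ty t) {
    switch (t) {
        case Ty::F16:  return conv_f16;
        case Ty::Q8_0: return conv_q8_0;
        default:       return nullptr;
    }
}

int main() {
    to_f32_fn fn = get_to_f32(Ty::Q8_0);
    printf("supported: %s\n", fn ? "yes" : "no");
    return 0;
}
```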
```diff
@@ -5078,6 +5756,15 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

+static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, 1, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    dequantize_mul_mat_vec<1, 1, convert_f16>
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
```
```diff
@@ -5168,83 +5855,6 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

-static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<1, 1, convert_f16>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return dequantize_row_q4_0_cuda;
-        case GGML_TYPE_Q4_1:
-            return dequantize_row_q4_1_cuda;
-        case GGML_TYPE_Q5_0:
-            return dequantize_row_q5_0_cuda;
-        case GGML_TYPE_Q5_1:
-            return dequantize_row_q5_1_cuda;
-        case GGML_TYPE_Q8_0:
-            return dequantize_row_q8_0_cuda;
-        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
-        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
-        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
-        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
-        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
-        case GGML_TYPE_F32:
-            return convert_fp32_to_fp16_cuda;
-        default:
-            return nullptr;
-    }
-}
-
-static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return dequantize_row_q4_0_cuda;
-        case GGML_TYPE_Q4_1:
-            return dequantize_row_q4_1_cuda;
-        case GGML_TYPE_Q5_0:
-            return dequantize_row_q5_0_cuda;
-        case GGML_TYPE_Q5_1:
-            return dequantize_row_q5_1_cuda;
-        case GGML_TYPE_Q8_0:
-            return dequantize_row_q8_0_cuda;
-        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
-        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
-        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
-        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
-        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
-        case GGML_TYPE_F16:
-            return convert_fp16_to_fp32_cuda;
-        default:
-            return nullptr;
-    }
-}
-
 static void ggml_mul_mat_q4_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
```
```diff
@@ -5737,6 +6347,39 @@ static void ggml_cpy_f32_f16_cuda(
         (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
 }

+static void ggml_cpy_f32_q8_0_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK8_0 == 0);
+    const int num_blocks = ne / QK8_0;
+    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void ggml_cpy_f32_q4_0_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK4_0 == 0);
+    const int num_blocks = ne / QK4_0;
+    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void ggml_cpy_f32_q4_1_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK4_1 == 0);
+    const int num_blocks = ne / QK4_1;
+    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
 static void ggml_cpy_f16_f16_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -5823,6 +6466,27 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const
     alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
 }

+static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const dim3 block_nums(1, nrows, 1);
+    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
+static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
+    // bitonic sort requires ncols to be power of 2
+    GGML_ASSERT((ncols & (ncols - 1)) == 0);
+
+    const dim3 block_dims(ncols, 1, 1);
+    const dim3 block_nums(1, nrows, 1);
+    if (order == GGML_SORT_ASC) {
+        k_argsort_f32_i32<GGML_SORT_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    } else if (order == GGML_SORT_DESC) {
+        k_argsort_f32_i32<GGML_SORT_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    } else {
+        GGML_ASSERT(false);
+    }
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
     const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
```
```diff
@@ -5838,13 +6502,14 @@ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, con
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
 }

-static void im2col_f32_f16_cuda(const float
-    int
-    int
-    int s0,
-
-
-
+static void im2col_f32_f16_cuda(const float* x, half* dst,
+    int IW, int IH, int OW, int OH, int KW, int KH, int IC,
+    int offset_delta,
+    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
+    const int parallel_elements = OW * KW * KH;
+    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
+    dim3 block_nums(num_blocks, OH, IC);
+    im2col_f32_f16<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, offset_delta, IW, IH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
 }

 // buffer pool for cuda
@@ -5915,7 +6580,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         return ptr;
     }
 #ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u
+    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
             (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
 #endif
     void * ptr;
@@ -6053,7 +6718,7 @@ void * ggml_cuda_host_malloc(size_t size) {
         // The allocation error can be bypassed. A null ptr will assigned out of this function.
         // This can fixed the OOM error in WSL.
         cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f
+        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
     }
```
@@ -6098,75 +6763,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
|
|
6098
6763
|
const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
|
6099
6764
|
if (nb0 == ts && nb1 == ts*ne0/bs) {
|
6100
6765
|
return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
|
6101
|
-
}
|
6102
|
-
if (nb0 == ts) {
|
6766
|
+
} else if (nb0 == ts) {
|
6103
6767
|
return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
|
6104
|
-
}
|
6105
|
-
|
6106
|
-
|
6107
|
-
|
6108
|
-
|
6109
|
-
|
6110
|
-
|
6111
|
-
}
|
6112
|
-
return cudaSuccess;
|
6113
|
-
}
|
6114
|
-
|
6115
|
-
static void ggml_cuda_op_repeat(
|
6116
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6117
|
-
const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
|
6118
|
-
// guaranteed to be an integer due to the check in ggml_can_repeat
|
6119
|
-
const int64_t ne0 = dst->ne[0];
|
6120
|
-
const int64_t ne1 = dst->ne[1];
|
6121
|
-
const int64_t ne2 = dst->ne[2];
|
6122
|
-
const int64_t ne3 = dst->ne[3];
|
6123
|
-
|
6124
|
-
const int64_t ne00 = src0->ne[0];
|
6125
|
-
const int64_t ne01 = src0->ne[1];
|
6126
|
-
const int64_t ne02 = src0->ne[2];
|
6127
|
-
const int64_t ne03 = src0->ne[3];
|
6128
|
-
|
6129
|
-
const size_t nb0 = dst->nb[0];
|
6130
|
-
const size_t nb1 = dst->nb[1];
|
6131
|
-
const size_t nb2 = dst->nb[2];
|
6132
|
-
const size_t nb3 = dst->nb[3];
|
6133
|
-
|
6134
|
-
const size_t nb00 = src0->nb[0];
|
6135
|
-
const size_t nb01 = src0->nb[1];
|
6136
|
-
const size_t nb02 = src0->nb[2];
|
6137
|
-
const size_t nb03 = src0->nb[3];
|
6138
|
-
|
6139
|
-
const int nr0 = (int)(ne0/ne00);
|
6140
|
-
const int nr1 = (int)(ne1/ne01);
|
6141
|
-
const int nr2 = (int)(ne2/ne02);
|
6142
|
-
const int nr3 = (int)(ne3/ne03);
|
6143
|
-
|
6144
|
-
// TODO: support for transposed / permuted tensors
|
6145
|
-
GGML_ASSERT(nb0 == sizeof(float));
|
6146
|
-
GGML_ASSERT(nb00 == sizeof(float));
|
6147
|
-
|
6148
|
-
// TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
|
6149
|
-
for (int i3 = 0; i3 < nr3; i3++) {
|
6150
|
-
for (int k3 = 0; k3 < ne03; k3++) {
|
6151
|
-
for (int i2 = 0; i2 < nr2; i2++) {
|
6152
|
-
for (int k2 = 0; k2 < ne02; k2++) {
|
6153
|
-
for (int i1 = 0; i1 < nr1; i1++) {
|
6154
|
-
for (int k1 = 0; k1 < ne01; k1++) {
|
6155
|
-
for (int i0 = 0; i0 < nr0; i0++) {
|
6156
|
-
CUDA_CHECK(cudaMemcpyAsync(
|
6157
|
-
(char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
|
6158
|
-
(const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
|
6159
|
-
ne00*nb0, cudaMemcpyDeviceToDevice, stream));
|
6160
|
-
}
|
6161
|
-
}
|
6162
|
-
}
|
6163
|
-
}
|
6164
|
-
}
|
6768
|
+
} else {
|
6769
|
+
for (int64_t i1 = 0; i1 < i1_diff; i1++) {
|
6770
|
+
const void * rx = (const void *) ((const char *) x + i1*nb1);
|
6771
|
+
void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
|
6772
|
+
// pretend the row is a matrix with cols=1
|
6773
|
+
cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
|
6774
|
+
if (r != cudaSuccess) return r;
|
6165
6775
|
}
|
6776
|
+
return cudaSuccess;
|
6166
6777
|
}
|
6167
|
-
|
6168
|
-
(void) src1;
|
6169
|
-
(void) src1_d;
|
6170
6778
|
}
|
6171
6779
|
|
6172
6780
|
static void ggml_cuda_op_get_rows(
|
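The rewritten ggml_cuda_cpy_tensor_2d above replaces the old early returns with an if/else chain and adds a per-row fallback: when neither the whole block nor whole rows are contiguous, each strided source row is copied with cudaMemcpy2DAsync, "pretending the row is a matrix with cols=1" so the pitch carries the element stride. A self-contained sketch of that trick (helper name and parameters are illustrative, not from the source):

```cpp
#include <cuda_runtime.h>

// Illustrative helper (not from the source): copy one row whose elements are
// nb0 bytes apart into a packed destination, as in the fallback path of
// ggml_cuda_cpy_tensor_2d above. Each "row" of the 2D copy is a single
// element, so the source pitch (nb0) expresses the element stride.
static cudaError_t copy_strided_row(void * dst, const void * src,
                                    size_t elem_size, size_t nb0,
                                    size_t n_elems, cudaStream_t stream) {
    return cudaMemcpy2DAsync(dst, elem_size,   // dst, dst pitch (packed)
                             src, nb0,         // src, src pitch (element stride)
                             elem_size,        // width in bytes: one element
                             n_elems,          // height: number of elements
                             cudaMemcpyDeviceToDevice, stream);
}
```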
@@ -6175,36 +6783,34 @@ static void ggml_cuda_op_get_rows(
|
|
6175
6783
|
|
6176
6784
|
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
6177
6785
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
6178
|
-
GGML_ASSERT(ggml_is_contiguous(src0));
|
6179
|
-
GGML_ASSERT(ggml_is_contiguous(src1));
|
6180
|
-
GGML_ASSERT(ggml_is_contiguous(dst));
|
6181
6786
|
|
6182
|
-
|
6183
|
-
|
6787
|
+
GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
|
6788
|
+
GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
|
6789
|
+
GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
|
6184
6790
|
|
6185
6791
|
const int32_t * src1_i32 = (const int32_t *) src1_d;
|
6186
6792
|
|
6187
6793
|
switch (src0->type) {
|
6188
6794
|
case GGML_TYPE_F16:
|
6189
|
-
|
6795
|
+
get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
|
6190
6796
|
break;
|
6191
6797
|
case GGML_TYPE_F32:
|
6192
|
-
|
6798
|
+
get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
|
6193
6799
|
break;
|
6194
6800
|
case GGML_TYPE_Q4_0:
|
6195
|
-
get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(
|
6801
|
+
get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
|
6196
6802
|
break;
|
6197
6803
|
case GGML_TYPE_Q4_1:
|
6198
|
-
get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(
|
6804
|
+
get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
|
6199
6805
|
break;
|
6200
6806
|
case GGML_TYPE_Q5_0:
|
6201
|
-
get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(
|
6807
|
+
get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
|
6202
6808
|
break;
|
6203
6809
|
case GGML_TYPE_Q5_1:
|
6204
|
-
get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(
|
6810
|
+
get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
|
6205
6811
|
break;
|
6206
6812
|
case GGML_TYPE_Q8_0:
|
6207
|
-
get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(
|
6813
|
+
get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
|
6208
6814
|
break;
|
6209
6815
|
default:
|
6210
6816
|
// TODO: k-quants
|
@@ -6213,46 +6819,76 @@ static void ggml_cuda_op_get_rows(
|
|
6213
6819
|
}
|
6214
6820
|
}
|
6215
6821
|
|
6216
|
-
|
6822
|
+
template<class op>
|
6823
|
+
inline void ggml_cuda_op_bin_bcast(
|
6217
6824
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6218
6825
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6219
6826
|
|
6220
6827
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
6221
6828
|
|
6222
|
-
const int64_t ne10 = src1->ne[0];
|
6223
|
-
const int64_t ne11 = src1->ne[1];
|
6224
|
-
|
6225
6829
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
6226
|
-
|
6830
|
+
op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
6227
6831
|
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
6228
|
-
|
6832
|
+
op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream);
|
6229
6833
|
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
6230
|
-
|
6834
|
+
op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream);
|
6231
6835
|
} else {
|
6232
|
-
fprintf(stderr, "src0
|
6836
|
+
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
|
6837
|
+
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
|
6233
6838
|
GGML_ASSERT(false);
|
6234
6839
|
}
|
6840
|
+
}
|
6841
|
+
|
6842
|
+
static void ggml_cuda_op_repeat(
|
6843
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6844
|
+
const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & main_stream) {
|
6845
|
+
|
6846
|
+
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
|
6235
6847
|
|
6236
6848
|
(void) src1;
|
6237
|
-
(void)
|
6849
|
+
(void) src1_d;
|
6238
6850
|
}
|
6239
6851
|
|
6240
|
-
inline void
|
6852
|
+
inline void ggml_cuda_op_add(
|
6853
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6854
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6855
|
+
|
6856
|
+
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
6857
|
+
}
|
6858
|
+
|
6859
|
+
inline void ggml_cuda_op_acc(
|
6241
6860
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6242
6861
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6243
6862
|
|
6244
6863
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6245
6864
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
6246
6865
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
6866
|
+
GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
|
6247
6867
|
|
6248
|
-
|
6249
|
-
|
6868
|
+
int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
|
6869
|
+
int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
|
6870
|
+
// int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
|
6871
|
+
int offset = dst->op_params[3] / 4; // offset in bytes
|
6250
6872
|
|
6251
|
-
|
6873
|
+
acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
|
6252
6874
|
|
6253
6875
|
(void) dst;
|
6254
6876
|
}
|
6255
6877
|
|
6878
|
+
inline void ggml_cuda_op_mul(
|
6879
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6880
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6881
|
+
|
6882
|
+
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
6883
|
+
}
|
6884
|
+
|
6885
|
+
inline void ggml_cuda_op_div(
|
6886
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6887
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6888
|
+
|
6889
|
+
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
6890
|
+
}
|
6891
|
+
|
6256
6892
|
inline void ggml_cuda_op_gelu(
|
6257
6893
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6258
6894
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
@@ -6281,6 +6917,34 @@ inline void ggml_cuda_op_silu(
|
|
6281
6917
|
(void) src1_dd;
|
6282
6918
|
}
|
6283
6919
|
|
6920
|
+
inline void ggml_cuda_op_gelu_quick(
|
6921
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6922
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6923
|
+
|
6924
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6925
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
6926
|
+
|
6927
|
+
gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
|
6928
|
+
|
6929
|
+
(void) src1;
|
6930
|
+
(void) dst;
|
6931
|
+
(void) src1_dd;
|
6932
|
+
}
|
6933
|
+
|
6934
|
+
inline void ggml_cuda_op_tanh(
|
6935
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6936
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6937
|
+
|
6938
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6939
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
6940
|
+
|
6941
|
+
tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
|
6942
|
+
|
6943
|
+
(void) src1;
|
6944
|
+
(void) dst;
|
6945
|
+
(void) src1_dd;
|
6946
|
+
}
|
6947
|
+
|
6284
6948
|
inline void ggml_cuda_op_relu(
|
6285
6949
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6286
6950
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
@@ -6295,38 +6959,38 @@ inline void ggml_cuda_op_relu(
|
|
6295
6959
|
(void) src1_dd;
|
6296
6960
|
}
|
6297
6961
|
|
6298
|
-
inline void
|
6962
|
+
inline void ggml_cuda_op_leaky_relu(
|
6299
6963
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6300
6964
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6301
6965
|
|
6302
6966
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6303
6967
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
6304
6968
|
|
6305
|
-
|
6969
|
+
float negative_slope;
|
6970
|
+
memcpy(&negative_slope, dst->op_params, sizeof(float));
|
6971
|
+
|
6972
|
+
leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
|
6306
6973
|
|
6307
6974
|
(void) src1;
|
6308
6975
|
(void) dst;
|
6309
6976
|
(void) src1_dd;
|
6310
6977
|
}
|
6311
6978
|
|
6312
|
-
inline void
|
6979
|
+
inline void ggml_cuda_op_sqr(
|
6313
6980
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6314
6981
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6315
6982
|
|
6316
6983
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6317
6984
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
6318
6985
|
|
6319
|
-
|
6320
|
-
const int64_t nrows = ggml_nrows(src0);
|
6321
|
-
|
6322
|
-
norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
|
6986
|
+
sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
|
6323
6987
|
|
6324
6988
|
(void) src1;
|
6325
6989
|
(void) dst;
|
6326
6990
|
(void) src1_dd;
|
6327
6991
|
}
|
6328
6992
|
|
6329
|
-
inline void
|
6993
|
+
inline void ggml_cuda_op_norm(
|
6330
6994
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6331
6995
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6332
6996
|
|
@@ -6339,26 +7003,111 @@ inline void ggml_cuda_op_rms_norm(
|
|
6339
7003
|
float eps;
|
6340
7004
|
memcpy(&eps, dst->op_params, sizeof(float));
|
6341
7005
|
|
6342
|
-
|
7006
|
+
norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
|
6343
7007
|
|
6344
7008
|
(void) src1;
|
6345
7009
|
(void) dst;
|
6346
7010
|
(void) src1_dd;
|
6347
7011
|
}
|
6348
7012
|
|
6349
|
-
inline void ggml_cuda_op_mul_mat_q(
|
6350
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
6351
|
-
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
6352
|
-
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
6353
7013
|
|
6354
|
-
|
7014
|
+
inline void ggml_cuda_op_group_norm(
|
7015
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
7016
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6355
7017
|
|
6356
|
-
|
6357
|
-
GGML_ASSERT(
|
7018
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
7019
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
6358
7020
|
|
6359
|
-
|
7021
|
+
int num_groups = dst->op_params[0];
|
7022
|
+
int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
|
7023
|
+
group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
|
6360
7024
|
|
6361
|
-
|
7025
|
+
(void) src1;
|
7026
|
+
(void) dst;
|
7027
|
+
(void) src1_dd;
|
7028
|
+
}
|
7029
|
+
|
7030
|
+
inline void ggml_cuda_op_concat(
|
7031
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
7032
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
7033
|
+
|
7034
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
7035
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
7036
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
7037
|
+
|
7038
|
+
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
|
7039
|
+
concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
|
7040
|
+
}
|
7041
|
+
|
7042
|
+
(void) src1;
|
7043
|
+
(void) dst;
|
7044
|
+
}
|
7045
|
+
|
7046
|
+
inline void ggml_cuda_op_upscale(
|
7047
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
7048
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
7049
|
+
|
7050
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
7051
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
7052
|
+
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
7053
|
+
|
7054
|
+
const int scale_factor = dst->op_params[0];
|
7055
|
+
|
7056
|
+
upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
|
7057
|
+
|
7058
|
+
(void) src1;
|
7059
|
+
(void) dst;
|
7060
|
+
}
|
7061
|
+
|
7062
|
+
inline void ggml_cuda_op_pad(
|
7063
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
7064
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
7065
|
+
|
7066
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
7067
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
7068
|
+
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
7069
|
+
|
7070
|
+
pad_f32_cuda(src0_dd, dst_dd,
|
7071
|
+
src0->ne[0], src0->ne[1], src0->ne[2],
|
7072
|
+
dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
|
7073
|
+
|
7074
|
+
(void) src1;
|
7075
|
+
(void) dst;
|
7076
|
+
}
|
7077
|
+
|
7078
|
+
inline void ggml_cuda_op_rms_norm(
|
7079
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
7080
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
7081
|
+
|
7082
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
7083
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
7084
|
+
|
7085
|
+
const int64_t ne00 = src0->ne[0];
|
7086
|
+
const int64_t nrows = ggml_nrows(src0);
|
7087
|
+
|
7088
|
+
float eps;
|
7089
|
+
memcpy(&eps, dst->op_params, sizeof(float));
|
7090
|
+
|
7091
|
+
rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
|
7092
|
+
|
7093
|
+
(void) src1;
|
7094
|
+
(void) dst;
|
7095
|
+
(void) src1_dd;
|
7096
|
+
}
|
7097
|
+
|
7098
|
+
inline void ggml_cuda_op_mul_mat_q(
|
7099
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
7100
|
+
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
7101
|
+
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
7102
|
+
|
7103
|
+
const int64_t ne00 = src0->ne[0];
|
7104
|
+
|
7105
|
+
const int64_t ne10 = src1->ne[0];
|
7106
|
+
GGML_ASSERT(ne10 % QK8_1 == 0);
|
7107
|
+
|
7108
|
+
const int64_t ne0 = dst->ne[0];
|
7109
|
+
|
7110
|
+
const int64_t row_diff = row_high - row_low;
|
6362
7111
|
|
6363
7112
|
int id;
|
6364
7113
|
CUDA_CHECK(cudaGetDevice(&id));
|
@@ -6474,6 +7223,8 @@ inline void ggml_cuda_op_mul_mat_vec_q(
|
|
6474
7223
|
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
6475
7224
|
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
6476
7225
|
|
7226
|
+
GGML_ASSERT(ggml_nrows(src1) == 1);
|
7227
|
+
|
6477
7228
|
const int64_t ne00 = src0->ne[0];
|
6478
7229
|
const int64_t row_diff = row_high - row_low;
|
6479
7230
|
|
@@ -6533,7 +7284,8 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
|
6533
7284
|
size_t ash;
|
6534
7285
|
dfloat * src1_dfloat = nullptr; // dfloat == half
|
6535
7286
|
|
6536
|
-
bool src1_convert_f16 =
|
7287
|
+
bool src1_convert_f16 =
|
7288
|
+
src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
6537
7289
|
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
|
6538
7290
|
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
6539
7291
|
|
@@ -6837,7 +7589,6 @@ inline void ggml_cuda_op_im2col(
|
|
6837
7589
|
|
6838
7590
|
const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
|
6839
7591
|
|
6840
|
-
const int64_t N = src1->ne[is_2D ? 3 : 2];
|
6841
7592
|
const int64_t IC = src1->ne[is_2D ? 2 : 1];
|
6842
7593
|
const int64_t IH = is_2D ? src1->ne[1] : 1;
|
6843
7594
|
const int64_t IW = src1->ne[0];
|
@@ -6848,17 +7599,51 @@ inline void ggml_cuda_op_im2col(
|
|
6848
7599
|
const int64_t OH = is_2D ? dst->ne[2] : 1;
|
6849
7600
|
const int64_t OW = dst->ne[1];
|
6850
7601
|
|
6851
|
-
const size_t
|
6852
|
-
const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
|
7602
|
+
const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
|
6853
7603
|
|
6854
|
-
im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
|
6855
|
-
OH, IW, IH, OW, IC, KH, KW, N,
|
6856
|
-
ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
|
7604
|
+
im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
|
6857
7605
|
|
6858
7606
|
(void) src0;
|
6859
7607
|
(void) src0_dd;
|
6860
7608
|
}
|
6861
7609
|
|
7610
|
+
|
7611
|
+
inline void ggml_cuda_op_sum_rows(
|
7612
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
7613
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
7614
|
+
|
7615
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
7616
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
7617
|
+
|
7618
|
+
const int64_t ncols = src0->ne[0];
|
7619
|
+
const int64_t nrows = ggml_nrows(src0);
|
7620
|
+
|
7621
|
+
sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
|
7622
|
+
|
7623
|
+
(void) src1;
|
7624
|
+
(void) dst;
|
7625
|
+
(void) src1_dd;
|
7626
|
+
}
|
7627
|
+
|
7628
|
+
inline void ggml_cuda_op_argsort(
|
7629
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
7630
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
7631
|
+
|
7632
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
7633
|
+
GGML_ASSERT( dst->type == GGML_TYPE_I32);
|
7634
|
+
|
7635
|
+
const int64_t ncols = src0->ne[0];
|
7636
|
+
const int64_t nrows = ggml_nrows(src0);
|
7637
|
+
|
7638
|
+
enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
|
7639
|
+
|
7640
|
+
argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
|
7641
|
+
|
7642
|
+
(void) src1;
|
7643
|
+
(void) dst;
|
7644
|
+
(void) src1_dd;
|
7645
|
+
}
|
7646
|
+
|
6862
7647
|
inline void ggml_cuda_op_diag_mask_inf(
|
6863
7648
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6864
7649
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
@@ -7067,7 +7852,7 @@ static void ggml_cuda_op_mul_mat(
|
|
7067
7852
|
const int64_t ne01 = src0->ne[1];
|
7068
7853
|
const int64_t ne02 = src0->ne[2];
|
7069
7854
|
const int64_t ne03 = src0->ne[3];
|
7070
|
-
|
7855
|
+
const int64_t nrows0 = ggml_nrows(src0);
|
7071
7856
|
|
7072
7857
|
const int64_t ne10 = src1->ne[0];
|
7073
7858
|
const int64_t ne11 = src1->ne[1];
|
@@ -7103,10 +7888,9 @@ static void ggml_cuda_op_mul_mat(
|
|
7103
7888
|
|
7104
7889
|
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
7105
7890
|
const bool src0_is_contiguous = ggml_is_contiguous(src0);
|
7106
|
-
|
7107
7891
|
const bool src1_is_contiguous = ggml_is_contiguous(src1);
|
7108
|
-
|
7109
|
-
|
7892
|
+
|
7893
|
+
const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
7110
7894
|
|
7111
7895
|
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
7112
7896
|
GGML_ASSERT(!(split && ne02 > 1));
|
@@ -7231,7 +8015,7 @@ static void ggml_cuda_op_mul_mat(
|
|
7231
8015
|
const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
|
7232
8016
|
|
7233
8017
|
// for split tensors the data begins at i0 == i0_offset_low
|
7234
|
-
char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
|
8018
|
+
char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
|
7235
8019
|
float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
|
7236
8020
|
char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
|
7237
8021
|
float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
|
@@ -7372,10 +8156,18 @@ static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, gg
|
|
7372
8156
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
|
7373
8157
|
}
|
7374
8158
|
|
8159
|
+
static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8160
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
|
8161
|
+
}
|
8162
|
+
|
7375
8163
|
static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7376
8164
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
|
7377
8165
|
}
|
7378
8166
|
|
8167
|
+
static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8168
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
|
8169
|
+
}
|
8170
|
+
|
7379
8171
|
static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7380
8172
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
|
7381
8173
|
}
|
@@ -7384,10 +8176,22 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
|
|
7384
8176
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
|
7385
8177
|
}
|
7386
8178
|
|
8179
|
+
static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8180
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
|
8181
|
+
}
|
8182
|
+
|
8183
|
+
static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8184
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
|
8185
|
+
}
|
8186
|
+
|
7387
8187
|
static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7388
8188
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
|
7389
8189
|
}
|
7390
8190
|
|
8191
|
+
static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8192
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
|
8193
|
+
}
|
8194
|
+
|
7391
8195
|
static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7392
8196
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
|
7393
8197
|
}
|
@@ -7396,12 +8200,28 @@ static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, g
|
|
7396
8200
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
|
7397
8201
|
}
|
7398
8202
|
|
8203
|
+
static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8204
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
|
8205
|
+
}
|
8206
|
+
|
8207
|
+
static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8208
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
|
8209
|
+
}
|
8210
|
+
|
8211
|
+
static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8212
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
|
8213
|
+
}
|
8214
|
+
|
8215
|
+
static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8216
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
|
8217
|
+
}
|
8218
|
+
|
7399
8219
|
static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7400
8220
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
|
7401
8221
|
}
|
7402
8222
|
|
7403
8223
|
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
7404
|
-
if (!g_cublas_loaded)
|
8224
|
+
if (!g_cublas_loaded) return false;
|
7405
8225
|
|
7406
8226
|
const int64_t ne10 = src1->ne[0];
|
7407
8227
|
|
@@ -7479,7 +8299,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
|
7479
8299
|
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
|
7480
8300
|
}
|
7481
8301
|
|
7482
|
-
__global__
|
8302
|
+
static __global__ void k_compute_batched_ptrs(
|
7483
8303
|
const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
|
7484
8304
|
const void ** ptrs_src, void ** ptrs_dst,
|
7485
8305
|
int ne12, int ne13,
|
@@ -7535,9 +8355,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
7535
8355
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
7536
8356
|
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
7537
8357
|
|
7538
|
-
|
7539
|
-
CUDA_CHECK(cudaGetDevice(&id));
|
7540
|
-
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
|
8358
|
+
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
|
7541
8359
|
|
7542
8360
|
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
7543
8361
|
void * src0_ddq = src0_extra->data_device[g_main_device];
|
@@ -7594,7 +8412,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
7594
8412
|
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
7595
8413
|
// use cublasGemmStridedBatchedEx
|
7596
8414
|
CUBLAS_CHECK(
|
7597
|
-
cublasGemmStridedBatchedEx(g_cublas_handles[
|
8415
|
+
cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
7598
8416
|
ne01, ne11, ne10,
|
7599
8417
|
&alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
|
7600
8418
|
(const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
|
@@ -7628,7 +8446,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
7628
8446
|
CUDA_CHECK(cudaGetLastError());
|
7629
8447
|
|
7630
8448
|
CUBLAS_CHECK(
|
7631
|
-
cublasGemmBatchedEx(g_cublas_handles[
|
8449
|
+
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
7632
8450
|
ne01, ne11, ne10,
|
7633
8451
|
&alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
|
7634
8452
|
(const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
|
@@ -7698,10 +8516,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
7698
8516
|
#ifdef GGML_CUDA_FORCE_DMMV
|
7699
8517
|
const bool use_mul_mat_vec_q = false;
|
7700
8518
|
#else
|
7701
|
-
const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
|
8519
|
+
const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
|
7702
8520
|
#endif // GGML_CUDA_FORCE_DMMV
|
7703
8521
|
|
7704
8522
|
if (use_mul_mat_vec_q) {
|
8523
|
+
// NOTE: this kernel does not support ggml_nrows(src1) > 1
|
7705
8524
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
|
7706
8525
|
} else {
|
7707
8526
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
|
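The updated dispatch above takes the quantized mat-vec path only when src1 is a single row, in addition to the existing DP4A capability check. A rough sketch of that decision; the MIN_CC_DP4A value below is written out as an assumption rather than taken from this file:

```cpp
// Sketch of the condition used above (not a drop-in copy of ggml-cuda).
// MIN_CC_DP4A is assumed to be 610 (compute capability 6.1, where the DP4A
// instruction becomes available); the real value comes from a #define
// elsewhere in ggml-cuda.cu.
static bool should_use_mul_mat_vec_q(int min_compute_capability,
                                     bool src0_is_quantized,
                                     long long src1_nrows) {
    const int MIN_CC_DP4A = 610;
    return min_compute_capability >= MIN_CC_DP4A
        && src0_is_quantized
        && src1_nrows == 1;
}
```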
@@ -7726,6 +8545,252 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
7726
8545
|
}
|
7727
8546
|
}
|
7728
8547
|
|
8548
|
+
#if 0
|
8549
|
+
template<typename ... Srcs>
|
8550
|
+
static __global__ void k_compute_batched_ptrs_id(
|
8551
|
+
const void ** ptrs_src, void ** ptrs_dst,
|
8552
|
+
int ne12, int ne13,
|
8553
|
+
int ne23,
|
8554
|
+
int nb02, int nb03,
|
8555
|
+
int nb12, int nb13,
|
8556
|
+
int nb2, int nb3,
|
8557
|
+
int r2, int r3,
|
8558
|
+
ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
|
8559
|
+
const half * src1_f16, half * dst_f16,
|
8560
|
+
const int32_t * ids, const int id,
|
8561
|
+
Srcs... src0s) {
|
8562
|
+
|
8563
|
+
int i = ids[id];
|
8564
|
+
|
8565
|
+
half * src0_f16;
|
8566
|
+
const void * srcs_ar[] = { (const half *) src0s... };
|
8567
|
+
if (src0_type == GGML_TYPE_F16) {
|
8568
|
+
src0_f16 = (half *) srcs_ar[i];
|
8569
|
+
} else {
|
8570
|
+
src0_f16 = src0_as_f16;
|
8571
|
+
if (threadIdx.x == 0 && threadIdx.y == 0) {
|
8572
|
+
const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
|
8573
|
+
to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
|
8574
|
+
}
|
8575
|
+
}
|
8576
|
+
|
8577
|
+
int i13 = blockIdx.x * blockDim.x + threadIdx.x;
|
8578
|
+
int i12 = blockIdx.y * blockDim.y + threadIdx.y;
|
8579
|
+
|
8580
|
+
if (i13 >= ne13 || i12 >= ne12) {
|
8581
|
+
return;
|
8582
|
+
}
|
8583
|
+
|
8584
|
+
int i03 = i13 / r3;
|
8585
|
+
int i02 = i12 / r2;
|
8586
|
+
|
8587
|
+
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02 + i03*nb03;
|
8588
|
+
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
|
8589
|
+
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
|
8590
|
+
}
|
8591
|
+
|
8592
|
+
static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
|
8593
|
+
const struct ggml_tensor * ids = dst->src[0];
|
8594
|
+
const struct ggml_tensor * src1 = dst->src[1];
|
8595
|
+
const struct ggml_tensor * src00 = dst->src[2];
|
8596
|
+
|
8597
|
+
const int id = dst->op_params[0];
|
8598
|
+
|
8599
|
+
GGML_ASSERT(!ggml_is_transposed(src00));
|
8600
|
+
GGML_ASSERT(!ggml_is_transposed(src1));
|
8601
|
+
|
8602
|
+
GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
|
8603
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
8604
|
+
|
8605
|
+
const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
|
8606
|
+
const int64_t ne01 = src00->ne[1];
|
8607
|
+
const int64_t ne02 = src00->ne[2];
|
8608
|
+
const int64_t ne03 = src00->ne[3];
|
8609
|
+
|
8610
|
+
//const int64_t nb01 = src00->nb[1];
|
8611
|
+
const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
|
8612
|
+
const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
|
8613
|
+
|
8614
|
+
const int64_t ne10 = src1->ne[0];
|
8615
|
+
const int64_t ne11 = src1->ne[1];
|
8616
|
+
const int64_t ne12 = src1->ne[2];
|
8617
|
+
const int64_t ne13 = src1->ne[3];
|
8618
|
+
|
8619
|
+
//const int64_t nb11 = src1->nb[1];
|
8620
|
+
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
|
8621
|
+
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
|
8622
|
+
|
8623
|
+
const int64_t ne1 = ggml_nelements(src1);
|
8624
|
+
const int64_t ne = ggml_nelements(dst);
|
8625
|
+
|
8626
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
8627
|
+
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
8628
|
+
|
8629
|
+
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
|
8630
|
+
|
8631
|
+
//ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
8632
|
+
//void * src0_ddq = src0_extra->data_device[g_main_device];
|
8633
|
+
//half * src0_as_f16 = (half *) src0_ddq;
|
8634
|
+
|
8635
|
+
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
8636
|
+
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
8637
|
+
|
8638
|
+
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
8639
|
+
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
8640
|
+
|
8641
|
+
// convert src1 to fp16
|
8642
|
+
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
8643
|
+
GGML_ASSERT(to_fp16_cuda != nullptr);
|
8644
|
+
|
8645
|
+
size_t src1_as = 0;
|
8646
|
+
half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
|
8647
|
+
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
|
8648
|
+
|
8649
|
+
size_t dst_as = 0;
|
8650
|
+
half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
|
8651
|
+
|
8652
|
+
GGML_ASSERT(ne12 % ne02 == 0);
|
8653
|
+
GGML_ASSERT(ne13 % ne03 == 0);
|
8654
|
+
|
8655
|
+
// broadcast factors
|
8656
|
+
const int64_t r2 = ne12/ne02;
|
8657
|
+
const int64_t r3 = ne13/ne03;
|
8658
|
+
|
8659
|
+
const half alpha_f16 = 1.0f;
|
8660
|
+
const half beta_f16 = 0.0f;
|
8661
|
+
|
8662
|
+
// use cublasGemmBatchedEx
|
8663
|
+
const int ne23 = ne12*ne13;
|
8664
|
+
|
8665
|
+
const void ** ptrs_src = nullptr;
|
8666
|
+
void ** ptrs_dst = nullptr;
|
8667
|
+
|
8668
|
+
size_t ptrs_src_s = 0;
|
8669
|
+
size_t ptrs_dst_s = 0;
|
8670
|
+
|
8671
|
+
ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
|
8672
|
+
ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
|
8673
|
+
|
8674
|
+
int64_t src0_ne = ggml_nelements(src00);
|
8675
|
+
half * src0_as_f16 = nullptr;
|
8676
|
+
size_t src0_as = 0;
|
8677
|
+
if (src00->type != GGML_TYPE_F16) {
|
8678
|
+
src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
|
8679
|
+
}
|
8680
|
+
|
8681
|
+
static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
|
8682
|
+
dim3 block_dims(ne13, ne12);
|
8683
|
+
k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
|
8684
|
+
ptrs_src, ptrs_dst,
|
8685
|
+
ne12, ne13,
|
8686
|
+
ne23,
|
8687
|
+
ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
|
8688
|
+
nb12, nb13,
|
8689
|
+
dst->nb[2], dst->nb[3],
|
8690
|
+
r2, r3,
|
8691
|
+
src00->type, src0_as_f16, src0_ne,
|
8692
|
+
src1_as_f16, dst_f16,
|
8693
|
+
(const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
|
8694
|
+
dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
|
8695
|
+
dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
|
8696
|
+
dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
|
8697
|
+
dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
|
8698
|
+
);
|
8699
|
+
CUDA_CHECK(cudaGetLastError());
|
8700
|
+
|
8701
|
+
CUBLAS_CHECK(
|
8702
|
+
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
8703
|
+
ne01, ne11, ne10,
|
8704
|
+
&alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
|
8705
|
+
(const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
|
8706
|
+
&beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
|
8707
|
+
ne23,
|
8708
|
+
CUBLAS_COMPUTE_16F,
|
8709
|
+
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
8710
|
+
|
8711
|
+
if (src0_as != 0) {
|
8712
|
+
ggml_cuda_pool_free(src0_as_f16, src0_as);
|
8713
|
+
}
|
8714
|
+
if (ptrs_src_s != 0) {
|
8715
|
+
ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
|
8716
|
+
}
|
8717
|
+
if (ptrs_dst_s != 0) {
|
8718
|
+
ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
|
8719
|
+
}
|
8720
|
+
|
8721
|
+
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
8722
|
+
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
|
8723
|
+
|
8724
|
+
ggml_cuda_pool_free(src1_as_f16, src1_as);
|
8725
|
+
ggml_cuda_pool_free(dst_f16, dst_as);
|
8726
|
+
}
|
8727
|
+
#endif
|
8728
|
+
|
8729
|
+
static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8730
|
+
#if 0
|
8731
|
+
ggml_cuda_mul_mat_id_cublas(dst);
|
8732
|
+
// TODO: mmq/mmv support
|
8733
|
+
#endif
|
8734
|
+
|
8735
|
+
GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
|
8736
|
+
|
8737
|
+
const struct ggml_tensor * ids = src0;
|
8738
|
+
const int32_t id = ((int32_t *) dst->op_params)[0];
|
8739
|
+
const int32_t n_as = ((int32_t *) dst->op_params)[1];
|
8740
|
+
|
8741
|
+
std::vector<char> ids_host(ggml_nbytes(ids));
|
8742
|
+
|
8743
|
+
if (ids->backend == GGML_BACKEND_GPU) {
|
8744
|
+
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
|
8745
|
+
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
|
8746
|
+
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
|
8747
|
+
} else {
|
8748
|
+
memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
|
8749
|
+
}
|
8750
|
+
|
8751
|
+
const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
|
8752
|
+
const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
|
8753
|
+
|
8754
|
+
ggml_tensor_extra_gpu src1_row_extra;
|
8755
|
+
ggml_tensor_extra_gpu dst_row_extra;
|
8756
|
+
|
8757
|
+
ggml_tensor src1_row = *src1;
|
8758
|
+
ggml_tensor dst_row = *dst;
|
8759
|
+
|
8760
|
+
src1_row.ne[1] = 1;
|
8761
|
+
dst_row.ne[1] = 1;
|
8762
|
+
|
8763
|
+
src1_row.nb[2] = src1_row.nb[1];
|
8764
|
+
dst_row.nb[2] = dst_row.nb[1];
|
8765
|
+
|
8766
|
+
src1_row.nb[3] = src1_row.nb[1];
|
8767
|
+
dst_row.nb[3] = dst_row.nb[1];
|
8768
|
+
|
8769
|
+
src1_row.extra = &src1_row_extra;
|
8770
|
+
dst_row.extra = &dst_row_extra;
|
8771
|
+
|
8772
|
+
|
8773
|
+
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
8774
|
+
//int32_t row_id;
|
8775
|
+
//CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
|
8776
|
+
//CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
|
8777
|
+
|
8778
|
+
const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
|
8779
|
+
|
8780
|
+
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
8781
|
+
|
8782
|
+
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
|
8783
|
+
|
8784
|
+
src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
|
8785
|
+
src1_row.data = (char *) src1->data + i01*src1->nb[1];
|
8786
|
+
|
8787
|
+
dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
|
8788
|
+
dst_row.data = (char *) dst->data + i01*dst->nb[1];
|
8789
|
+
|
8790
|
+
ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
|
8791
|
+
}
|
8792
|
+
}
|
8793
|
+
|
7729
8794
|
static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7730
8795
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
|
7731
8796
|
}
|
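In the new ggml_cuda_mul_mat_id above, the per-token expert index is read from the I32 ids tensor after it has been copied to the host, using the tensor's byte strides. A small sketch of that lookup, with parameter names chosen for illustration:

```cpp
#include <cstdint>
#include <cstring>

// Illustrative sketch of the ids lookup in ggml_cuda_mul_mat_id above:
// ids_host points at the host copy of the I32 ids tensor, nb0/nb1 are its
// byte strides, i01 is the token row and id selects the expert slot.
static int32_t read_row_id(const char * ids_host, size_t nb0, size_t nb1,
                           int64_t i01, int32_t id) {
    int32_t row_id;
    std::memcpy(&row_id, ids_host + i01*nb1 + id*nb0, sizeof(row_id));
    return row_id;
}
```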
@@ -7770,14 +8835,17 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
|
|
7770
8835
|
char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
|
7771
8836
|
|
7772
8837
|
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
7773
|
-
ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
|
7774
|
-
ne10, ne11, nb10, nb11, nb12, main_stream);
|
8838
|
+
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
7775
8839
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
7776
|
-
ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
|
7777
|
-
|
8840
|
+
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
8841
|
+
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
8842
|
+
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
8843
|
+
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
8844
|
+
ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
8845
|
+
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
8846
|
+
ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
7778
8847
|
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
7779
|
-
ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
|
7780
|
-
ne10, ne11, nb10, nb11, nb12, main_stream);
|
8848
|
+
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
7781
8849
|
} else {
|
7782
8850
|
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
7783
8851
|
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
@@ -7788,6 +8856,7 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
|
|
7788
8856
|
}
|
7789
8857
|
|
7790
8858
|
static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8859
|
+
// TODO: why do we pass dst as src1 here?
|
7791
8860
|
ggml_cuda_cpy(src0, dst, nullptr);
|
7792
8861
|
(void) src1;
|
7793
8862
|
}
|
@@ -7813,12 +8882,28 @@ static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1,
|
|
7813
8882
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
|
7814
8883
|
}
|
7815
8884
|
|
8885
|
+
static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8886
|
+
GGML_ASSERT(ggml_is_contiguous(src0));
|
8887
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
|
8888
|
+
}
|
8889
|
+
|
8890
|
+
static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8891
|
+
GGML_ASSERT(ggml_is_contiguous(src0));
|
8892
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
|
8893
|
+
}
|
8894
|
+
|
7816
8895
|
static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7817
8896
|
(void) src0;
|
7818
8897
|
(void) src1;
|
7819
8898
|
(void) dst;
|
7820
8899
|
}
|
7821
8900
|
|
8901
|
+
static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
|
8902
|
+
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
8903
|
+
|
8904
|
+
return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
|
8905
|
+
}
|
8906
|
+
|
7822
8907
|
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
7823
8908
|
const int64_t nrows = ggml_nrows(tensor);
|
7824
8909
|
|
@@ -7868,8 +8953,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
|
7868
8953
|
|
7869
8954
|
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
7870
8955
|
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
7871
|
-
size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
|
7872
|
-
* ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
|
8956
|
+
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
7873
8957
|
}
|
7874
8958
|
|
7875
8959
|
char * buf;
|
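The padding in the hunk above rounds the last row of a quantized tensor up to a multiple of MATRIX_ROW_PADDING (512) elements, now expressed through ggml_row_size instead of explicit type_size/blck_size arithmetic. A small worked sketch of the same arithmetic; row_size_bytes only approximates ggml_row_size for block-quantized types and is an assumption for illustration:

```cpp
#include <cstddef>

// Approximation of ggml_row_size for a block-quantized type (illustration
// only): n elements packed into blocks of block_size elements, with
// type_size bytes per block.
static size_t row_size_bytes(size_t n, size_t type_size, size_t block_size) {
    return n / block_size * type_size;
}

// Same shape as the padding in ggml_cuda_transform_tensor above: if ne0 is
// not a multiple of MATRIX_ROW_PADDING, grow the allocation by the missing
// tail so kernels may safely read past the logical end of the last row.
static size_t padded_alloc_size(size_t size, size_t ne0,
                                size_t type_size, size_t block_size) {
    const size_t MATRIX_ROW_PADDING = 512; // per the comment in the hunk above
    if (ne0 % MATRIX_ROW_PADDING != 0) {
        size += row_size_bytes(MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING,
                               type_size, block_size);
    }
    return size;
}
```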
@@ -8068,8 +9152,9 @@ void ggml_cuda_set_main_device(const int main_device) {
|
|
8068
9152
|
main_device, g_device_count, g_main_device);
|
8069
9153
|
return;
|
8070
9154
|
}
|
8071
|
-
|
8072
|
-
if (g_device_count > 1) {
|
9155
|
+
|
9156
|
+
if (g_main_device != main_device && g_device_count > 1) {
|
9157
|
+
g_main_device = main_device;
|
8073
9158
|
cudaDeviceProp prop;
|
8074
9159
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
|
8075
9160
|
fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
|
@@ -8095,7 +9180,7 @@ void ggml_cuda_free_scratch() {
|
|
8095
9180
|
}
|
8096
9181
|
|
8097
9182
|
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
8098
|
-
if (!g_cublas_loaded)
|
9183
|
+
if (!g_cublas_loaded) return false;
|
8099
9184
|
|
8100
9185
|
ggml_cuda_func_t func;
|
8101
9186
|
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
@@ -8128,9 +9213,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8128
9213
|
case GGML_OP_ADD:
|
8129
9214
|
func = ggml_cuda_add;
|
8130
9215
|
break;
|
9216
|
+
case GGML_OP_ACC:
|
9217
|
+
func = ggml_cuda_acc;
|
9218
|
+
break;
|
8131
9219
|
case GGML_OP_MUL:
|
8132
9220
|
func = ggml_cuda_mul;
|
8133
9221
|
break;
|
9222
|
+
case GGML_OP_DIV:
|
9223
|
+
func = ggml_cuda_div;
|
9224
|
+
break;
|
8134
9225
|
case GGML_OP_UNARY:
|
8135
9226
|
switch (ggml_get_unary_op(tensor)) {
|
8136
9227
|
case GGML_UNARY_OP_GELU:
|
@@ -8139,15 +9230,37 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8139
9230
|
case GGML_UNARY_OP_SILU:
|
8140
9231
|
func = ggml_cuda_silu;
|
8141
9232
|
break;
|
9233
|
+
case GGML_UNARY_OP_GELU_QUICK:
|
9234
|
+
func = ggml_cuda_gelu_quick;
|
9235
|
+
break;
|
9236
|
+
case GGML_UNARY_OP_TANH:
|
9237
|
+
func = ggml_cuda_tanh;
|
9238
|
+
break;
|
8142
9239
|
case GGML_UNARY_OP_RELU:
|
8143
9240
|
func = ggml_cuda_relu;
|
8144
9241
|
break;
|
8145
9242
|
default:
|
8146
9243
|
return false;
|
8147
|
-
}
|
9244
|
+
}
|
9245
|
+
break;
|
8148
9246
|
case GGML_OP_NORM:
|
8149
9247
|
func = ggml_cuda_norm;
|
8150
9248
|
break;
|
9249
|
+
case GGML_OP_GROUP_NORM:
|
9250
|
+
func = ggml_cuda_group_norm;
|
9251
|
+
break;
|
9252
|
+
case GGML_OP_CONCAT:
|
9253
|
+
func = ggml_cuda_concat;
|
9254
|
+
break;
|
9255
|
+
case GGML_OP_UPSCALE:
|
9256
|
+
func = ggml_cuda_upscale;
|
9257
|
+
break;
|
9258
|
+
case GGML_OP_PAD:
|
9259
|
+
func = ggml_cuda_pad;
|
9260
|
+
break;
|
9261
|
+
case GGML_OP_LEAKY_RELU:
|
9262
|
+
func = ggml_cuda_leaky_relu;
|
9263
|
+
break;
|
8151
9264
|
case GGML_OP_RMS_NORM:
|
8152
9265
|
func = ggml_cuda_rms_norm;
|
8153
9266
|
break;
|
@@ -8157,6 +9270,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8157
9270
|
}
|
8158
9271
|
func = ggml_cuda_mul_mat;
|
8159
9272
|
break;
|
9273
|
+
case GGML_OP_MUL_MAT_ID:
|
9274
|
+
if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
|
9275
|
+
return false;
|
9276
|
+
}
|
9277
|
+
func = ggml_cuda_mul_mat_id;
|
9278
|
+
break;
|
8160
9279
|
case GGML_OP_SCALE:
|
8161
9280
|
func = ggml_cuda_scale;
|
8162
9281
|
break;
|
@@ -8164,9 +9283,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8164
9283
|
func = ggml_cuda_sqr;
|
8165
9284
|
break;
|
8166
9285
|
case GGML_OP_CLAMP:
|
8167
|
-
if (!any_on_device) {
|
8168
|
-
return false;
|
8169
|
-
}
|
8170
9286
|
func = ggml_cuda_clamp;
|
8171
9287
|
break;
|
8172
9288
|
case GGML_OP_CPY:
|
@@ -8175,6 +9291,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8175
9291
|
case GGML_OP_CONT:
|
8176
9292
|
func = ggml_cuda_dup;
|
8177
9293
|
break;
|
9294
|
+
case GGML_OP_NONE:
|
8178
9295
|
case GGML_OP_RESHAPE:
|
8179
9296
|
case GGML_OP_VIEW:
|
8180
9297
|
case GGML_OP_PERMUTE:
|
@@ -8196,6 +9313,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8196
9313
|
case GGML_OP_IM2COL:
|
8197
9314
|
func = ggml_cuda_im2col;
|
8198
9315
|
break;
|
9316
|
+
case GGML_OP_SUM_ROWS:
|
9317
|
+
func = ggml_cuda_sum_rows;
|
9318
|
+
break;
|
9319
|
+
case GGML_OP_ARGSORT:
|
9320
|
+
func = ggml_cuda_argsort;
|
9321
|
+
break;
|
8199
9322
|
default:
|
8200
9323
|
return false;
|
8201
9324
|
}
|
@@ -8212,7 +9335,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8212
9335
|
|
8213
9336
|
int ggml_cuda_get_device_count() {
|
8214
9337
|
int device_count;
|
8215
|
-
|
9338
|
+
if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
|
9339
|
+
return 0;
|
9340
|
+
}
|
8216
9341
|
return device_count;
|
8217
9342
|
}
|
8218
9343
|
|
@@ -8228,27 +9353,16 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
|
|
8228
9353
|
|
8229
9354
|
#define UNUSED GGML_UNUSED
|
8230
9355
|
|
8231
|
-
|
8232
|
-
};
|
8233
|
-
|
8234
|
-
static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
8235
|
-
return GGML_CUDA_NAME;
|
8236
|
-
|
8237
|
-
UNUSED(backend);
|
8238
|
-
}
|
8239
|
-
|
8240
|
-
static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
8241
|
-
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
8242
|
-
delete cuda_ctx;
|
8243
|
-
delete backend;
|
8244
|
-
}
|
9356
|
+
// cuda buffer
|
8245
9357
|
|
8246
9358
|
struct ggml_backend_buffer_context_cuda {
|
8247
|
-
|
8248
|
-
|
9359
|
+
int device;
|
9360
|
+
void * dev_ptr = nullptr;
|
8249
9361
|
ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
|
8250
9362
|
size_t temp_tensor_extra_index = 0;
|
8251
9363
|
|
9364
|
+
ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
|
9365
|
+
|
8252
9366
|
~ggml_backend_buffer_context_cuda() {
|
8253
9367
|
delete[] temp_tensor_extras;
|
8254
9368
|
}
|
@@ -8269,41 +9383,20 @@ struct ggml_backend_buffer_context_cuda {
|
|
8269
9383
|
|
8270
9384
|
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
8271
9385
|
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
8272
|
-
CUDA_CHECK(cudaFree(ctx->
|
9386
|
+
CUDA_CHECK(cudaFree(ctx->dev_ptr));
|
8273
9387
|
delete ctx;
|
8274
9388
|
}
|
8275
9389
|
|
8276
9390
|
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
8277
9391
|
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
8278
|
-
return ctx->
|
8279
|
-
}
|
8280
|
-
|
8281
|
-
static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
8282
|
-
int64_t row_low = 0;
|
8283
|
-
int64_t row_high = ggml_nrows(tensor);
|
8284
|
-
int64_t nrows_split = row_high - row_low;
|
8285
|
-
|
8286
|
-
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
8287
|
-
|
8288
|
-
int64_t ne0 = tensor->ne[0];
|
8289
|
-
|
8290
|
-
if (ggml_is_quantized(tensor->type)) {
|
8291
|
-
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
8292
|
-
size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
|
8293
|
-
* ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
|
8294
|
-
}
|
8295
|
-
}
|
8296
|
-
|
8297
|
-
return size;
|
8298
|
-
|
8299
|
-
UNUSED(buffer);
|
9392
|
+
return ctx->dev_ptr;
|
8300
9393
|
}
|
8301
9394
|
|
8302
9395
|
static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
8303
9396
|
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
8304
9397
|
|
8305
9398
|
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
8306
|
-
assert(tensor->view_src->buffer->
|
9399
|
+
assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
|
8307
9400
|
tensor->backend = tensor->view_src->backend;
|
8308
9401
|
tensor->extra = tensor->view_src->extra;
|
8309
9402
|
return;
|
@@ -8311,7 +9404,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
|
|
8311
9404
|
|
8312
9405
|
ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
|
8313
9406
|
|
8314
|
-
extra->data_device[
|
9407
|
+
extra->data_device[ctx->device] = tensor->data;
|
8315
9408
|
|
8316
9409
|
tensor->backend = GGML_BACKEND_GPU;
|
8317
9410
|
tensor->extra = extra;
|
@@ -8323,64 +9416,207 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
         int64_t nrows_split = row_high - row_low;
 
         size_t original_size = ggml_nbytes_split(tensor, nrows_split);
-        size_t padded_size =
+        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
 
         if (padded_size > original_size && tensor->view_src == nullptr) {
-            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[
+            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
         }
     }
 
     UNUSED(buffer);
 }
 
+static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
+
+    UNUSED(buffer);
+}
+
+static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
+
+    UNUSED(buffer);
+}
+
 static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
-    /* .free_buffer
-    /* .get_base
-    /* .
-    /* .
-    /* .
+    /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
+    /* .get_base = */ ggml_backend_cuda_buffer_get_base,
+    /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
+    /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
+    /* .cpy_tensor_from = */ NULL,
+    /* .cpy_tensor_to = */ NULL,
 };
 
-
-    ggml_cuda_set_device(g_main_device);
+// cuda buffer type
 
-
+static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    int device = (int) (intptr_t) buft->context;
+
+    ggml_cuda_set_device(device);
 
     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
 
-
-    CUDA_CHECK(cudaMalloc(&
+    void * dev_ptr;
+    CUDA_CHECK(cudaMalloc(&dev_ptr, size));
 
-
+    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
+
+    return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
 }
 
-static size_t
+static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 128;
+
+    UNUSED(buft);
+}
+
+static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
+    int64_t row_low = 0;
+    int64_t row_high = ggml_nrows(tensor);
+    int64_t nrows_split = row_high - row_low;
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+    int64_t ne0 = tensor->ne[0];
+
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+    }
+
+    return size;
+
+    UNUSED(buft);
+}
+
+static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    return ggml_backend_is_cuda(backend);
+
+    UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
+    /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
+    /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
+    /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
+    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
+    static bool ggml_backend_buffer_type_cuda_initialized = false;
+    if (!ggml_backend_buffer_type_cuda_initialized) {
+        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
+            ggml_backend_buffer_type_cuda[i] = {
+                /* .iface = */ cuda_backend_buffer_type_interface,
+                /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
+            };
+        }
+        ggml_backend_buffer_type_cuda_initialized = true;
+    }
+
+    return &ggml_backend_buffer_type_cuda[device];
+}
+
+// host buffer type
+
+static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
+    delete ctx;
+}
+
+static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * ptr;
+    CUDA_CHECK(cudaMallocHost(&ptr, size));
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
+
+    return buffer;
+
+    UNUSED(buft);
+}
+
+struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
+    /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+    /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+    /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+    /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
+        /* .iface = */ cuda_backend_host_buffer_type_interface,
+        /* .context = */ nullptr,
+    };
+
+    return &ggml_backend_buffer_type_cuda_host;
+}
+
+// backend
+
+struct ggml_backend_context_cuda {
+    int device;
+};
+
+static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+    return GGML_CUDA_NAME;
+
     UNUSED(backend);
 }
 
+static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    delete cuda_ctx;
+    delete backend;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
+}
+
 static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[
-
-    UNUSED(backend);
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
 }
 
 static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[
-
-    UNUSED(backend);
+    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
 }
 
 static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
-
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
 
     UNUSED(backend);
 }
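Note: the hunk above replaces the backend-global CUDA buffer with a per-device ggml_backend_buffer_type. A minimal caller-side sketch of the new interface follows; ggml_backend_buft_alloc_buffer() and ggml_backend_buffer_free() are assumed to be the generic helpers declared in ggml-backend.h (they are not part of this file), and the existence of device 0 is assumed.

    // Sketch only: allocate and release a 16 MiB buffer on CUDA device 0
    // through the per-device buffer type introduced above.
    #include "ggml-backend.h"
    #include "ggml-cuda.h"

    static void cuda_buffer_type_demo(void) {
        ggml_backend_buffer_type_t buft = ggml_backend_cuda_buffer_type(0);            // per-device buffer type
        ggml_backend_buffer_t      buf  = ggml_backend_buft_alloc_buffer(buft, 16u * 1024 * 1024); // cudaMalloc under the hood
        // ... place tensors into the buffer via ggml-alloc ...
        ggml_backend_buffer_free(buf);                                                  // cudaFree via free_buffer
    }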
@@ -8394,14 +9630,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
     UNUSED(cgraph);
 }
 
-
+static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
     UNUSED(plan);
 }
 
-
+static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
@@ -8409,7 +9645,9 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
 }
 
 static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    ggml_cuda_set_main_device(cuda_ctx->device);
 
     ggml_compute_params params = {};
     params.type = GGML_TASK_COMPUTE;
@@ -8417,13 +9655,18 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
             continue;
-
+
         assert(node->backend == GGML_BACKEND_GPU);
+        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+        assert(node->extra != nullptr);
+
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
                 assert(node->src[j]->backend == GGML_BACKEND_GPU);
+                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                assert(node->src[j]->extra != nullptr);
             }
         }
 
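Note: the asserts added above pin every graph node (and its sources) to the backend's own device buffer type. The supports_op callback wired up in the next hunk is what lets callers ask the backend whether it can run a given node at all; a short sketch follows, assuming the public ggml_backend_supports_op() wrapper in ggml-backend.h forwards to this callback.

    // Sketch only: choose between a CUDA backend and a CPU fallback per node.
    static ggml_backend_t pick_backend_for_node(ggml_backend_t cuda_backend, ggml_backend_t cpu_backend,
                                                const struct ggml_tensor * node) {
        if (cuda_backend != NULL && ggml_backend_supports_op(cuda_backend, node)) {
            return cuda_backend; // e.g. MUL_MAT with mismatched ne[3] is declined by the callback in the next hunk
        }
        return cpu_backend;      // anything the CUDA backend declines runs on the CPU
    }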
@@ -8460,27 +9703,143 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     UNUSED(backend);
 }
 
+static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_GELU_QUICK:
+                case GGML_UNARY_OP_TANH:
+                    return true;
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
+            {
+                struct ggml_tensor * a;
+                struct ggml_tensor * b;
+                if (op->op == GGML_OP_MUL_MAT) {
+                    a = op->src[0];
+                    b = op->src[1];
+                } else {
+                    a = op->src[2];
+                    b = op->src[1];
+                }
+                if (a->ne[3] != b->ne[3]) {
+                    return false;
+                }
+                return true;
+            } break;
+        case GGML_OP_GET_ROWS:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                        return true;
+                    default:
+                        return false;
+                }
+            } break;
+        case GGML_OP_CPY:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                ggml_type src1_type = op->src[1]->type;
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                return false;
+            } break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_NORM:
+        case GGML_OP_REPEAT:
+        case GGML_OP_DUP:
+        case GGML_OP_ADD:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_SCALE:
+        case GGML_OP_SQR:
+        case GGML_OP_CLAMP:
+        case GGML_OP_CONT:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_ROPE:
+        case GGML_OP_ALIBI:
+        case GGML_OP_IM2COL:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_ACC:
+        case GGML_OP_CONCAT:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_UPSCALE:
+        case GGML_OP_PAD:
+        case GGML_OP_LEAKY_RELU:
+            return true;
+        default:
+            return false;
+    }
+
+    UNUSED(backend);
+}
+
 static ggml_backend_i cuda_backend_i = {
-    /* .get_name
-    /* .free
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .supports_op = */ nullptr,
+    /* .get_name = */ ggml_backend_cuda_name,
+    /* .free = */ ggml_backend_cuda_free,
+    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
+    /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
+    /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
+    /* .cpy_tensor_from_async = */ NULL,
+    /* .cpy_tensor_to_async = */ NULL,
+    /* .synchronize = */ ggml_backend_cuda_synchronize,
+    /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
+    /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
+    /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
+    /* .graph_compute = */ ggml_backend_cuda_graph_compute,
+    /* .supports_op = */ ggml_backend_cuda_supports_op,
 };
 
-ggml_backend_t ggml_backend_cuda_init() {
+ggml_backend_t ggml_backend_cuda_init(int device) {
     ggml_init_cublas(); // TODO: remove from ggml.c
 
-
+    if (device < 0 || device >= ggml_cuda_get_device_count()) {
+        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+        return nullptr;
+    }
+
+    // not strictly necessary, but it may reduce the overhead of the first graph_compute
+    ggml_cuda_set_main_device(device);
+
+    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
+        /* .device = */ device
+    };
 
     ggml_backend_t cuda_backend = new ggml_backend {
         /* .interface = */ cuda_backend_i,
@@ -8489,3 +9848,27 @@ ggml_backend_t ggml_backend_cuda_init() {
 
     return cuda_backend;
 }
+
+bool ggml_backend_is_cuda(ggml_backend_t backend) {
+    return backend->iface.get_name == ggml_backend_cuda_name;
+}
+
+static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
+    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
+    return cuda_backend;
+
+    UNUSED(params);
+}
+
+extern "C" int ggml_backend_cuda_reg_devices();
+
+int ggml_backend_cuda_reg_devices() {
+    int device_count = ggml_cuda_get_device_count();
+    //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
+    for (int i = 0; i < device_count; i++) {
+        char name[128];
+        snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
+        ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
+    }
+    return device_count;
+}
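Note: ggml_backend_cuda_reg_devices() above registers one backend per visible GPU under the names "CUDA0", "CUDA1", and so on. A minimal sketch of direct, single-device use of the new entry points from this hunk and the previous one follows; ggml_backend_free() and ggml_backend_name() are assumed to come from ggml-backend.h, and device 0 is assumed to exist.

    // Sketch only: create a CUDA backend for device 0, verify its type, release it.
    #include <stdio.h>
    #include "ggml-backend.h"
    #include "ggml-cuda.h"

    int main(void) {
        ggml_backend_t backend = ggml_backend_cuda_init(0); // now takes a device index
        if (backend == NULL) {
            fprintf(stderr, "no CUDA device 0\n");          // invalid device returns nullptr
            return 1;
        }
        if (ggml_backend_is_cuda(backend)) {
            printf("backend: %s\n", ggml_backend_name(backend));
        }
        ggml_backend_free(backend);
        return 0;
    }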