llama_cpp 0.9.5 → 0.10.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +121 -15
- data/ext/llama_cpp/src/ggml-alloc.c +42 -7
- data/ext/llama_cpp/src/ggml-alloc.h +7 -0
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1140 -355
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +506 -158
- data/ext/llama_cpp/src/ggml-metal.metal +795 -144
- data/ext/llama_cpp/src/ggml.c +331 -111
- data/ext/llama_cpp/src/ggml.h +49 -4
- data/ext/llama_cpp/src/llama.cpp +749 -329
- data/ext/llama_cpp/src/llama.h +28 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +20 -2
- metadata +2 -2
--- a/data/ext/llama_cpp/src/ggml-cuda.cu
+++ b/data/ext/llama_cpp/src/ggml-cuda.cu
@@ -1,7 +1,8 @@
 #include <algorithm>
-#include <cinttypes>
 #include <cstddef>
 #include <cstdint>
+#include <cinttypes>
+#include <float.h>
 #include <limits>
 #include <stdint.h>
 #include <stdio.h>
@@ -69,6 +70,7 @@
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamFireAndForget hipStreamFireAndForget
 #define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
@@ -190,7 +192,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
             fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
                 cudaGetErrorString(err_)); \
             fprintf(stderr, "current device: %d\n", id); \
-            exit(1); \
+            GGML_ASSERT(!"CUDA error"); \
         } \
     } while (0)
@@ -204,7 +206,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
             fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
                     err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
             fprintf(stderr, "current device: %d\n", id); \
-            exit(1); \
+            GGML_ASSERT(!"cuBLAS error"); \
         } \
     } while (0)
@@ -216,7 +218,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
             cudaGetDevice(&id); \
             fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
             fprintf(stderr, "current device: %d\n", id); \
-            exit(1); \
+            GGML_ASSERT(!"cuBLAS error"); \
         } \
     } while (0)
 #endif // CUDART_VERSION >= 11
@@ -433,8 +435,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define WARP_SIZE 32
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
-#define CUDA_ADD_BLOCK_SIZE 256
-#define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_RELU_BLOCK_SIZE 256
@@ -527,40 +527,87 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
     return x;
 }
 
-static
-
+static __device__ __forceinline__ float op_repeat(const float a, const float b) {
+    return b;
+}
 
-
-
-    }
-    dst[i] = x[i] + y[i%ky];
+static __device__ __forceinline__ float op_add(const float a, const float b) {
+    return a + b;
 }
 
-static
-
+static __device__ __forceinline__ float op_mul(const float a, const float b) {
+    return a * b;
+}
 
-
-
-    }
-    dst[i] = __hadd(x[i], __float2half(y[i]));
+static __device__ __forceinline__ float op_div(const float a, const float b) {
+    return a / b;
 }
 
-
-
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1, int s2, int s3,
+        /*int s10,*/ int s11, int s12, int s13) {
+    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
+    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
+    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
 
-    if (
+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
         return;
     }
-
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
+        const int i10 = i0 % ne10;
+        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+    }
 }
 
-
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1, int s2, int s3,
+        /*int s10,*/ int s11, int s12, int s13) {
+
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
-
+    const int i3 = i/(ne2*ne1*ne0);
+    const int i2 = (i/(ne1*ne0)) % ne2;
+    const int i1 = (i/ne0) % ne1;
+    const int i0 = i % ne0;
+
+    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
         return;
     }
-
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    const int i10 = i0 % ne10;
+    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
 }
 
 static __global__ void gelu_f32(const float * x, float * dst, const int k) {
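The two kernels above replace the old per-op add/mul kernels with one templated broadcast kernel: src1 indices wrap with % in every dimension, which is what lets a smaller src1 repeat across a larger dst. A minimal host-side C++ sketch of the same index arithmetic (shapes here are illustrative, not from the diff):

#include <cstdio>

int main() {
    const int ne0 = 4, ne1 = 3;   // dst/src0 shape
    const int ne10 = 4, ne11 = 1; // src1 broadcasts along dim 1
    float src0[ne1][ne0], src1[ne11][ne10], dst[ne1][ne0];
    for (int i1 = 0; i1 < ne1; i1++)
        for (int i0 = 0; i0 < ne0; i0++)
            src0[i1][i0] = (float)(i1*ne0 + i0);
    for (int i0 = 0; i0 < ne10; i0++) src1[0][i0] = 0.5f;

    for (int i1 = 0; i1 < ne1; i1++) {
        const int i11 = i1 % ne11;          // wrap the broadcast dim, as in k_bin_bcast
        for (int i0 = 0; i0 < ne0; i0++) {
            const int i10 = i0 % ne10;
            dst[i1][i0] = src0[i1][i0] + src1[i11][i10];
        }
    }
    printf("dst[2][3] = %g\n", dst[2][3]); // 11.5
    return 0;
}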
@@ -604,12 +651,10 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
 }
 
 template <int block_size>
-static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
-    const float eps = 1e-5f;
-
     float2 mean_var = make_float2(0.f, 0.f);
 
     for (int col = tid; col < ncols; col += block_size) {
@@ -4559,6 +4604,116 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
     cpy_1(cx + x_offset, cdst + dst_offset);
 }
 
+static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q8_0 * dsti = (block_q8_0 *) cdsti;
+
+    float amax = 0.0f; // absolute max
+
+    for (int j = 0; j < QK8_0; j++) {
+        const float v = xi[j];
+        amax = fmaxf(amax, fabsf(v));
+    }
+
+    const float d = amax / ((1 << 7) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK8_0; ++j) {
+        const float x0 = xi[j]*id;
+
+        dsti->qs[j] = roundf(x0);
+    }
+}
+
+static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_0 * dsti = (block_q4_0 *) cdsti;
+
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_0; ++j) {
+        const float v = xi[j];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            vmax = v;
+        }
+    }
+
+    const float d  = vmax / -8;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK4_0/2; ++j) {
+        const float x0 = xi[0       + j]*id;
+        const float x1 = xi[QK4_0/2 + j]*id;
+
+        const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
+        const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
+
+        dsti->qs[j]  = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_1 * dsti = (block_q4_1 *) cdsti;
+
+    float vmin = FLT_MAX;
+    float vmax = -FLT_MAX;
+
+    for (int j = 0; j < QK4_1; ++j) {
+        const float v = xi[j];
+
+        if (v < vmin) vmin = v;
+        if (v > vmax) vmax = v;
+    }
+
+    const float d  = (vmax - vmin) / ((1 << 4) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->dm.x = d;
+    dsti->dm.y = vmin;
+
+    for (int j = 0; j < QK4_1/2; ++j) {
+        const float x0 = (xi[0       + j] - vmin)*id;
+        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
+
+        const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
+        const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
+
+        dsti->qs[j]  = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+template <cpy_kernel_t cpy_blck, int qk>
+static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
+                                 const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+                                 const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
+    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int i02 = i / (ne00*ne01);
+    const int i01 = (i - i02*ne01*ne00) / ne00;
+    const int i00 = (i - i02*ne01*ne00 - i01*ne00);
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+    const int i12 = i / (ne10*ne11);
+    const int i11 = (i - i12*ne10*ne11) / ne10;
+    const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+    cpy_blck(cx + x_offset, cdst + dst_offset);
+}
+
 static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
     const float y = (i0 / 2 - low) / max(0.001f, high - low);
     return 1.0f - min(1.0f, max(0.0f, y));
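The new cpy_blck_f32_q8_0 applies the standard Q8_0 scheme: the scale is d = amax/127 and each value is rounded to x/d. A hedged standalone sketch of the same math on the host (the real block_q8_0 stores d as fp16; this stand-in uses float):

#include <cmath>
#include <cstdint>
#include <cstdio>

#define QK8_0 32 // matches ggml's Q8_0 block size
struct block_q8_0_sketch { float d; int8_t qs[QK8_0]; }; // simplified stand-in

static void quantize_block(const float * x, block_q8_0_sketch * b) {
    float amax = 0.0f;
    for (int j = 0; j < QK8_0; j++) amax = fmaxf(amax, fabsf(x[j]));
    const float d  = amax / 127.0f;   // (1 << 7) - 1
    const float id = d ? 1.0f/d : 0.0f;
    b->d = d;
    for (int j = 0; j < QK8_0; j++) b->qs[j] = (int8_t) roundf(x[j]*id);
}

int main() {
    float x[QK8_0];
    for (int j = 0; j < QK8_0; j++) x[j] = 0.01f*j - 0.1f;
    block_q8_0_sketch b;
    quantize_block(x, &b);
    printf("d = %f, qs[31] = %d, dequant = %f\n", b.d, b.qs[31], b.qs[31]*b.d);
    return 0;
}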
@@ -4713,6 +4868,65 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols,
     dst[i] = col * m_k + x[i];
 }
 
+static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.y;
+    const int col = threadIdx.x;
+
+    float sum = 0.0f;
+    for (int i = col; i < ncols; i += blockDim.x) {
+        sum += x[row * ncols + i];
+    }
+
+    sum = warp_reduce_sum(sum);
+
+    if (col == 0) {
+        dst[row] = sum;
+    }
+}
+
+template<typename T>
+static inline __device__ void swap(T & a, T & b) {
+    T tmp = a;
+    a = b;
+    b = tmp;
+}
+
+template<ggml_sort_order order>
+static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
+    // bitonic sort
+    int col = threadIdx.x;
+    int row = blockIdx.y;
+
+    if (col >= ncols) return;
+
+    const float * x_row = x + row * ncols;
+    int * dst_row = dst + row * ncols;
+
+    // initialize indices
+    if (col < ncols) {
+        dst_row[col] = col;
+    }
+    __syncthreads();
+
+    for (int k = 2; k <= ncols; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
+                        swap(dst_row[col], dst_row[ixj]);
+                    }
+                } else {
+                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
+                        swap(dst_row[col], dst_row[ixj]);
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+}
+
 static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
     const int col = blockDim.y*blockIdx.y + threadIdx.y;
     const int row = blockDim.x*blockIdx.x + threadIdx.x;
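k_argsort_f32_i32 is a textbook bitonic sorting network over the index array, so it only works when ncols is a power of two (the launcher below asserts this). A hedged host-side reference in C++ that produces the same ascending index order, useful for checking the kernel (data is made up):

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    const int ncols = 8; // must be a power of two for the bitonic network
    const float x[ncols] = {3.f, 1.f, 4.f, 1.5f, 5.f, 9.f, 2.6f, 0.f};

    std::vector<int> idx(ncols);
    std::iota(idx.begin(), idx.end(), 0);
    std::stable_sort(idx.begin(), idx.end(),
                     [&](int a, int b) { return x[a] < x[b]; });

    for (int i : idx) printf("%d ", i); // expected: 7 1 3 6 0 2 4 5
    printf("\n");
    return 0;
}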
@@ -4722,8 +4936,9 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
     }
 
     const int i = row*ncols + col;
-    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
-    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
+    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
 }
 
 static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
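The switch from INT_MAX to FLT_MAX keeps the masked value finite (unlike subtracting INFINITY) while still being negative enough that the following softmax term underflows to exactly zero. A quick check in plain C++ (illustrative only):

#include <cfloat>
#include <cmath>
#include <cstdio>

int main() {
    const float x = 1.25f;
    // masked value stays finite, but expf() of it underflows to 0.0
    const float masked = x - FLT_MAX;
    printf("masked = %g, expf(masked - max) = %g\n", masked, expf(masked - x));
    return 0;
}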
@@ -4845,25 +5060,119 @@ static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const
     k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
 }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+template<float (*bin_op)(const float, const float)>
+struct bin_bcast_cuda {
+    template<typename src0_t, typename src1_t, typename dst_t>
+    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
+            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
+            cudaStream_t stream) {
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+
+        int nr0 = ne10/ne0;
+        int nr1 = ne11/ne1;
+        int nr2 = ne12/ne2;
+        int nr3 = ne13/ne3;
+
+        int nr[4] = { nr0, nr1, nr2, nr3 };
+
+        // collapse dimensions until first broadcast dimension
+        int64_t cne0[] = {ne0, ne1, ne2, ne3};
+        int64_t cne1[] = {ne10, ne11, ne12, ne13};
+        size_t cnb0[] = {nb0, nb1, nb2, nb3};
+        size_t cnb1[] = {nb10, nb11, nb12, nb13};
+        auto collapse = [](int64_t cne[]) {
+            cne[0] *= cne[1];
+            cne[1] = cne[2];
+            cne[2] = cne[3];
+            cne[3] = 1;
+        };
+
+        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
+            cnb[1] *= cne[1];
+            cnb[2] *= cne[2];
+            cnb[3] *= cne[3];
+        };
+
+        for (int i = 0; i < 4; i++) {
+            if (nr[i] != 1) {
+                break;
+            }
+            if (i > 0) {
+                collapse_nb(cnb0, cne0);
+                collapse_nb(cnb1, cne1);
+                collapse(cne0);
+                collapse(cne1);
+            }
+        }
+        {
+            int64_t ne0 = cne0[0];
+            int64_t ne1 = cne0[1];
+            int64_t ne2 = cne0[2];
+            int64_t ne3 = cne0[3];
+
+            int64_t ne10 = cne1[0];
+            int64_t ne11 = cne1[1];
+            int64_t ne12 = cne1[2];
+            int64_t ne13 = cne1[3];
+
+            //size_t nb0 = cnb0[0];
+            size_t nb1 = cnb0[1];
+            size_t nb2 = cnb0[2];
+            size_t nb3 = cnb0[3];
+
+            //size_t nb10 = cnb1[0];
+            size_t nb11 = cnb1[1];
+            size_t nb12 = cnb1[2];
+            size_t nb13 = cnb1[3];
+
+            //size_t s0 = nb0 / sizeof(src1_t);
+            size_t s1 = nb1 / sizeof(src1_t);
+            size_t s2 = nb2 / sizeof(src1_t);
+            size_t s3 = nb3 / sizeof(src1_t);
+
+            //size_t s10 = nb10 / sizeof(src1_t);
+            size_t s11 = nb11 / sizeof(src1_t);
+            size_t s12 = nb12 / sizeof(src1_t);
+            size_t s13 = nb13 / sizeof(src1_t);
+
+
+            const int block_size = 128;
+
+            int64_t hne0 = std::max(ne0/2LL, 1LL);
+
+            dim3 block_dims;
+            block_dims.x = std::min<unsigned int>(hne0, block_size);
+            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
+            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
+
+            dim3 block_nums(
+                (hne0 + block_dims.x - 1) / block_dims.x,
+                (ne1 + block_dims.y - 1) / block_dims.y,
+                (ne2*ne3 + block_dims.z - 1) / block_dims.z
+            );
 
-
-
-
-
+            if (block_nums.z > 65535) {
+                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
+                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
+                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd,
+                    ne0, ne1, ne2, ne3,
+                    ne10, ne11, ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s10, */ s11, s12, s13);
+            } else {
+                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd,
+                    ne0, ne1, ne2, ne3,
+                    ne10, ne11, ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s10, */ s11, s12, s13);
+            }
+        }
+    }
+};
 
 static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
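bin_bcast_cuda first fuses leading non-broadcast dimensions so the kernel sees the smallest possible 4D problem. A hedged sketch of just the collapse step in isolation (shapes and broadcast factors are made up):

#include <cstdint>
#include <cstdio>

int main() {
    // dst is {4,3,2,2}; src1 is {4,3,2,1}, broadcasting only in the last dim,
    // so the three leading dims can be fused into one
    int64_t cne0[] = {4, 3, 2, 2};
    auto collapse = [](int64_t cne[]) {
        cne[0] *= cne[1]; cne[1] = cne[2]; cne[2] = cne[3]; cne[3] = 1;
    };
    // per-dim factors ne1x/nex; 0 here because src1 has extent 1 in dim 3
    int nr[4] = {1, 1, 1, 0};
    for (int i = 0; i < 4 && nr[i] == 1; i++) {
        if (i > 0) collapse(cne0);
    }
    printf("collapsed dst: {%lld,%lld,%lld,%lld}\n", // {24,2,1,1}
           (long long)cne0[0], (long long)cne0[1],
           (long long)cne0[2], (long long)cne0[3]);
    return 0;
}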
@@ -4885,14 +5194,14 @@ static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
     sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
-static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
         const dim3 block_dims(WARP_SIZE, 1, 1);
-        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
     } else {
         const dim3 block_dims(1024, 1, 1);
-        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
     }
 }
 
@@ -4914,34 +5223,10 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
     quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
 }
 
-template<typename dst_t>
-static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-template<typename dst_t>
-static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-template<typename dst_t>
-static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-template<typename dst_t>
-static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-template<typename dst_t>
-static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
 
 template<typename dst_t>
@@ -4990,6 +5275,64 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
 #endif
 }
 
+static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_cuda;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_cuda;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_cuda;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_cuda;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_cuda;
+        case GGML_TYPE_F32:
+            return dequantize_block_cuda<1, 1, convert_f32>;
+        default:
+            return nullptr;
+    }
+}
+
+static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_cuda;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_cuda;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_cuda;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_cuda;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_cuda;
+        case GGML_TYPE_F16:
+            return dequantize_block_cuda<1, 1, convert_f16>;
+        default:
+            return nullptr;
+    }
+}
+
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
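These dispatch tables replace one hand-written launcher per type with a single template instantiated per case. A hedged generic illustration of the same pattern in plain C++ (types and names here are illustrative, not ggml's):

#include <cstdio>

enum my_type { TYPE_A, TYPE_B };
typedef void (*convert_t)(const void * src, float * dst, int n);

// one templated implementation, specialized by a per-type decode function
template <float (*decode)(const void *, int)>
void convert_impl(const void * src, float * dst, int n) {
    for (int i = 0; i < n; i++) dst[i] = decode(src, i);
}

float decode_a(const void * src, int i) { return ((const char  *) src)[i]; }
float decode_b(const void * src, int i) { return ((const short *) src)[i]; }

convert_t get_converter(my_type t) {
    switch (t) {
        case TYPE_A: return convert_impl<decode_a>;
        case TYPE_B: return convert_impl<decode_b>;
        default:     return nullptr; // nullptr means "type not supported"
    }
}

int main() {
    const short src[3] = {1, 2, 3};
    float dst[3];
    get_converter(TYPE_B)(src, dst, 3);
    printf("%g %g %g\n", dst[0], dst[1], dst[2]);
    return 0;
}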
@@ -5078,6 +5421,15 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
+static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, 1, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    dequantize_mul_mat_vec<1, 1, convert_f16>
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -5168,83 +5520,6 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
-static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<1, 1, convert_f16>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return dequantize_row_q4_0_cuda;
-        case GGML_TYPE_Q4_1:
-            return dequantize_row_q4_1_cuda;
-        case GGML_TYPE_Q5_0:
-            return dequantize_row_q5_0_cuda;
-        case GGML_TYPE_Q5_1:
-            return dequantize_row_q5_1_cuda;
-        case GGML_TYPE_Q8_0:
-            return dequantize_row_q8_0_cuda;
-        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
-        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
-        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
-        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
-        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
-        case GGML_TYPE_F32:
-            return convert_fp32_to_fp16_cuda;
-        default:
-            return nullptr;
-    }
-}
-
-static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return dequantize_row_q4_0_cuda;
-        case GGML_TYPE_Q4_1:
-            return dequantize_row_q4_1_cuda;
-        case GGML_TYPE_Q5_0:
-            return dequantize_row_q5_0_cuda;
-        case GGML_TYPE_Q5_1:
-            return dequantize_row_q5_1_cuda;
-        case GGML_TYPE_Q8_0:
-            return dequantize_row_q8_0_cuda;
-        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
-        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
-        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
-        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
-        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
-        case GGML_TYPE_F16:
-            return convert_fp16_to_fp32_cuda;
-        default:
-            return nullptr;
-    }
-}
-
 static void ggml_mul_mat_q4_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
@@ -5737,19 +6012,52 @@ static void ggml_cpy_f32_f16_cuda(
         (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
 }
 
-static void
+static void ggml_cpy_f32_q8_0_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
     const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
 
-
-
+    GGML_ASSERT(ne % QK8_0 == 0);
+    const int num_blocks = ne / QK8_0;
+    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
         (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
 }
 
-static void
-    const
-
+static void ggml_cpy_f32_q4_0_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK4_0 == 0);
+    const int num_blocks = ne / QK4_0;
+    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void ggml_cpy_f32_q4_1_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK4_1 == 0);
+    const int num_blocks = ne / QK4_1;
+    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void ggml_cpy_f16_f16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }
 
 static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
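Note the launch shape of the new quantizing copies: cpy_f32_q runs with <<<num_blocks, 1>>>, one thread per CUDA block, and each thread quantizes one whole QK-sized block of floats. A hedged sketch of the grid arithmetic (values are illustrative):

#include <cstdio>
#define QK8_0 32

int main() {
    const int ne = 4096;               // total elements to copy
    const int num_blocks = ne / QK8_0; // one quantized block per CUDA block
    printf("%d elements -> %d blocks of %d\n", ne, num_blocks, QK8_0);
    return 0;
}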
@@ -5823,6 +6131,27 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const
     alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
 }
 
+static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const dim3 block_nums(1, nrows, 1);
+    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
+static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
+    // bitonic sort requires ncols to be power of 2
+    GGML_ASSERT((ncols & (ncols - 1)) == 0);
+
+    const dim3 block_dims(ncols, 1, 1);
+    const dim3 block_nums(1, nrows, 1);
+    if (order == GGML_SORT_ASC) {
+        k_argsort_f32_i32<GGML_SORT_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    } else if (order == GGML_SORT_DESC) {
+        k_argsort_f32_i32<GGML_SORT_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    } else {
+        GGML_ASSERT(false);
+    }
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
     const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -5915,7 +6244,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         return ptr;
     }
 #ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u
+    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
             (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
 #endif
     void * ptr;
@@ -6053,7 +6382,7 @@ void * ggml_cuda_host_malloc(size_t size) {
     // The allocation error can be bypassed. A null ptr will assigned out of this function.
     // This can fixed the OOM error in WSL.
     cudaGetLastError();
-    fprintf(stderr, "WARNING: failed to allocate %.2f
+    fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
         size/1024.0/1024.0, cudaGetErrorString(err));
     return nullptr;
 }
@@ -6098,75 +6427,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
         return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
-    }
-    if (nb0 == ts) {
+    } else if (nb0 == ts) {
         return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
-    }
-
-
-
-
-
-
-    }
-    return cudaSuccess;
-}
-
-static void ggml_cuda_op_repeat(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
-    // guaranteed to be an integer due to the check in ggml_can_repeat
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const int nr0 = (int)(ne0/ne00);
-    const int nr1 = (int)(ne1/ne01);
-    const int nr2 = (int)(ne2/ne02);
-    const int nr3 = (int)(ne3/ne03);
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
-    for (int i3 = 0; i3 < nr3; i3++) {
-        for (int k3 = 0; k3 < ne03; k3++) {
-            for (int i2 = 0; i2 < nr2; i2++) {
-                for (int k2 = 0; k2 < ne02; k2++) {
-                    for (int i1 = 0; i1 < nr1; i1++) {
-                        for (int k1 = 0; k1 < ne01; k1++) {
-                            for (int i0 = 0; i0 < nr0; i0++) {
-                                CUDA_CHECK(cudaMemcpyAsync(
-                                    (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
-                                    (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
-                                    ne00*nb0, cudaMemcpyDeviceToDevice, stream));
-                            }
-                        }
-                    }
-                }
-            }
+    } else {
+        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+            const void * rx = (const void *) ((const char *) x + i1*nb1);
+            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+            // pretend the row is a matrix with cols=1
+            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+            if (r != cudaSuccess) return r;
         }
+        return cudaSuccess;
     }
-
-    (void) src1;
-    (void) src1_d;
 }
 
 static void ggml_cuda_op_get_rows(
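The new fallback leans on a cudaMemcpy2DAsync trick: by declaring each element a 1-column "row", one call gathers a strided row into a dense buffer. A hedged standalone sketch of the same trick (sizes are illustrative):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    const int ne0 = 4;                   // elements to gather
    const size_t nb0 = 2*sizeof(float);  // source stride: every other float
    float h_src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    float *d_src, *d_dst;
    cudaMalloc(&d_src, sizeof(h_src));
    cudaMalloc(&d_dst, ne0*sizeof(float));
    cudaMemcpy(d_src, h_src, sizeof(h_src), cudaMemcpyHostToDevice);

    // height-many "rows" of sizeof(float) bytes each: a strided gather
    cudaMemcpy2DAsync(d_dst, sizeof(float), d_src, nb0,
                      sizeof(float), ne0, cudaMemcpyDeviceToDevice, 0);

    float h_dst[4];
    cudaMemcpy(h_dst, d_dst, sizeof(h_dst), cudaMemcpyDeviceToHost);
    printf("%g %g %g %g\n", h_dst[0], h_dst[1], h_dst[2], h_dst[3]); // 0 2 4 6
    cudaFree(d_src); cudaFree(d_dst);
    return 0;
}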
@@ -6213,44 +6485,55 @@ static void ggml_cuda_op_get_rows(
     }
 }
 
-
+template<class op>
+inline void ggml_cuda_op_bin_bcast(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-
+        op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-
+        op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-
+        op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream);
     } else {
-        fprintf(stderr, "src0
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
         GGML_ASSERT(false);
     }
+}
+
+static void ggml_cuda_op_repeat(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
 
     (void) src1;
-    (void)
+    (void) src1_d;
 }
 
-inline void
+inline void ggml_cuda_op_add(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
-
-
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
 
-
-    const
+inline void ggml_cuda_op_mul(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
-
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
 
-
+inline void ggml_cuda_op_div(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
 }
 
 inline void ggml_cuda_op_gelu(
@@ -6319,7 +6602,10 @@ inline void ggml_cuda_op_norm(
     const int64_t ne00 = src0->ne[0];
     const int64_t nrows = ggml_nrows(src0);
 
-    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
 
     (void) src1;
     (void) dst;
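The epsilon now comes from the tensor's op_params instead of a hard-coded 1e-5f, read back via memcpy because op_params is a raw int32 array and a float stored there must not be type-punned through a pointer cast. A minimal illustration in plain C++:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    // a float packed into an int32 parameter slot, as ggml's op_params does
    int32_t op_params[2] = {0};
    const float eps_in = 1e-5f;
    memcpy(&op_params[0], &eps_in, sizeof(float));

    float eps;
    memcpy(&eps, op_params, sizeof(float)); // well-defined round-trip
    printf("eps = %g\n", eps);              // 1e-05
    return 0;
}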
@@ -6474,6 +6760,8 @@ inline void ggml_cuda_op_mul_mat_vec_q(
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, const cudaStream_t & stream) {
 
+    GGML_ASSERT(ggml_nrows(src1) == 1);
+
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
 
@@ -6533,7 +6821,8 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     size_t ash;
     dfloat * src1_dfloat = nullptr; // dfloat == half
 
-    bool src1_convert_f16 =
+    bool src1_convert_f16 =
+        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
         src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
         src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
 
@@ -6859,6 +7148,42 @@ inline void ggml_cuda_op_im2col(
     (void) src0_dd;
 }
 
+inline void ggml_cuda_op_sum_rows(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_argsort(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+    argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7067,7 +7392,7 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-
+    const int64_t nrows0 = ggml_nrows(src0);
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
@@ -7103,10 +7428,9 @@ static void ggml_cuda_op_mul_mat(
 
     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
-
     const bool src1_is_contiguous = ggml_is_contiguous(src1);
-
-
+
+    const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
 
     const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
     GGML_ASSERT(!(split && ne02 > 1));
@@ -7231,7 +7555,7 @@ static void ggml_cuda_op_mul_mat(
             const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
 
             // for split tensors the data begins at i0 == i0_offset_low
-            char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
+            char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
             float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
             char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
             float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
@@ -7376,6 +7700,10 @@ static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, gg
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
 }
 
+static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
+}
+
 static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
 }
@@ -7401,7 +7729,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded)
+    if (!g_cublas_loaded) return false;
 
     const int64_t ne10 = src1->ne[0];
 
@@ -7479,7 +7807,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
-__global__ void k_compute_batched_ptrs(
+static __global__ void k_compute_batched_ptrs(
     const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
     const void ** ptrs_src, void ** ptrs_dst,
     int ne12, int ne13,
@@ -7535,9 +7863,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
+    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -7594,7 +7920,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         // use cublasGemmStridedBatchedEx
         CUBLAS_CHECK(
-        cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+        cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
             ne01, ne11, ne10,
             &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
                         (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
@@ -7628,7 +7954,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUDA_CHECK(cudaGetLastError());
 
         CUBLAS_CHECK(
-        cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+        cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
             ne01, ne11, ne10,
             &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
                         (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
@@ -7698,10 +8024,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 #ifdef GGML_CUDA_FORCE_DMMV
         const bool use_mul_mat_vec_q = false;
 #else
-        const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+        const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
 #endif // GGML_CUDA_FORCE_DMMV
 
         if (use_mul_mat_vec_q) {
+            // NOTE: this kernel does not support ggml_nrows(src1) > 1
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
         } else {
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
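The mul_mat_vec_q path is gated on MIN_CC_DP4A, i.e. the dp4a 4x int8 dot-product instruction available from compute capability 6.1. A hedged sketch of the same capability check in isolation (the constant's value matches ggml-cuda.cu; everything else is illustrative):

#include <cstdio>
#include <cuda_runtime.h>

#define MIN_CC_DP4A 610 // dp4a needs compute capability >= 6.1

int main() {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) return 1;
    const int cc = 100*prop.major + 10*prop.minor;
    printf("compute capability %d -> mul_mat_vec_q eligible: %s\n",
           cc, cc >= MIN_CC_DP4A ? "yes" : "no");
    return 0;
}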
@@ -7726,6 +8053,219 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     }
 }
 
+#if 0
+template<typename ... Srcs>
+static __global__ void k_compute_batched_ptrs_id(
+        const void ** ptrs_src, void ** ptrs_dst,
+        int ne12, int ne13,
+        int ne23,
+        int nb02, int nb03,
+        int nb12, int nb13,
+        int nb2, int nb3,
+        int r2, int r3,
+        ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
+        const half * src1_f16, half * dst_f16,
+        const int32_t * ids, const int id,
+        Srcs... src0s) {
+
+    int i = ids[id];
+
+    half * src0_f16;
+    const void * srcs_ar[] = { (const half *) src0s... };
+    if (src0_type == GGML_TYPE_F16) {
+        src0_f16 = (half *) srcs_ar[i];
+    } else {
+        src0_f16 = src0_as_f16;
+        if (threadIdx.x == 0 && threadIdx.y == 0) {
+            const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
+            to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
+        }
+    }
+
+    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i13 >= ne13 || i12 >= ne12) {
+        return;
+    }
+
+    int i03 = i13 / r3;
+    int i02 = i12 / r2;
+
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02 + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)  dst_f16 + i12* nb2/2 + i13* nb3/2;
+}
+
+static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
+    const struct ggml_tensor * ids = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * src00 = dst->src[2];
+
+    const int id = dst->op_params[0];
+
+    GGML_ASSERT(!ggml_is_transposed(src00));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+
+    GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
+    const int64_t ne01 = src00->ne[1];
+    const int64_t ne02 = src00->ne[2];
+    const int64_t ne03 = src00->ne[3];
+
+    //const int64_t nb01 = src00->nb[1];
+    const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
+    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    //const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
+    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+
+    const int64_t ne1 = ggml_nelements(src1);
+    const int64_t ne  = ggml_nelements(dst);
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
+
+    //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    //void * src0_ddq = src0_extra->data_device[g_main_device];
+    //half * src0_as_f16 = (half *) src0_ddq;
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    // convert src1 to fp16
+    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+    GGML_ASSERT(to_fp16_cuda != nullptr);
+
+    size_t src1_as = 0;
+    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
+    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
+
+    size_t dst_as = 0;
+    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    const half alpha_f16 = 1.0f;
+    const half beta_f16  = 0.0f;
+
+    // use cublasGemmBatchedEx
+    const int ne23 = ne12*ne13;
+
+    const void ** ptrs_src = nullptr;
+          void ** ptrs_dst = nullptr;
+
+    size_t ptrs_src_s = 0;
+    size_t ptrs_dst_s = 0;
+
+    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+    ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
+
+    int64_t src0_ne = ggml_nelements(src00);
+    half * src0_as_f16 = nullptr;
+    size_t src0_as = 0;
+    if (src00->type != GGML_TYPE_F16) {
+        src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
+    }
+
+    static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
+    dim3 block_dims(ne13, ne12);
+    k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
+            ptrs_src, ptrs_dst,
+            ne12, ne13,
+            ne23,
+            ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
+            nb12, nb13,
+            dst->nb[2], dst->nb[3],
+            r2, r3,
+            src00->type, src0_as_f16, src0_ne,
+            src1_as_f16, dst_f16,
+            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
+            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
+    );
+    CUDA_CHECK(cudaGetLastError());
+
+    CUBLAS_CHECK(
+    cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
+            ne01, ne11, ne10,
+            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
+                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
+            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
+            ne23,
+            CUBLAS_COMPUTE_16F,
+            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    if (src0_as != 0) {
+        ggml_cuda_pool_free(src0_as_f16, src0_as);
+    }
+    if (ptrs_src_s != 0) {
+        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+    }
+    if (ptrs_dst_s != 0) {
+        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+    }
+
|
8229
|
+
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
8230
|
+
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
|
8231
|
+
|
8232
|
+
ggml_cuda_pool_free(src1_as_f16, src1_as);
|
8233
|
+
ggml_cuda_pool_free(dst_f16, dst_as);
|
8234
|
+
}
|
8235
|
+
#endif
|
8236
|
+
|
8237
|
+
static void ggml_cuda_mul_mat_id(const ggml_tensor * _src0, const ggml_tensor * _src1, ggml_tensor * dst) {
|
8238
|
+
#if 0
|
8239
|
+
//#ifdef CUDA_USE_TENSOR_CORES
|
8240
|
+
// const bool use_tensor_cores = true;
|
8241
|
+
//#else
|
8242
|
+
// const bool use_tensor_cores = false;
|
8243
|
+
//#endif
|
8244
|
+
|
8245
|
+
ggml_cuda_mul_mat_id_cublas(dst);
|
8246
|
+
|
8247
|
+
// TODO: mmq/mmv support
|
8248
|
+
#else
|
8249
|
+
const struct ggml_tensor * ids = dst->src[0];
|
8250
|
+
const struct ggml_tensor * src1 = dst->src[1];
|
8251
|
+
const int id = dst->op_params[0];
|
8252
|
+
|
8253
|
+
int32_t * ids_dev = (int32_t *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
|
8254
|
+
|
8255
|
+
int32_t a_id;
|
8256
|
+
CUDA_CHECK(cudaMemcpyAsync(&a_id, ids_dev + id, sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
|
8257
|
+
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
|
8258
|
+
|
8259
|
+
GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
|
8260
|
+
const struct ggml_tensor * src0 = dst->src[a_id + 2];
|
8261
|
+
|
8262
|
+
ggml_cuda_mul_mat(src0, src1, dst);
|
8263
|
+
#endif
|
8264
|
+
|
8265
|
+
(void) _src0;
|
8266
|
+
(void) _src1;
|
8267
|
+
}
|
8268
|
+
|
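The block above is the new GGML_OP_MUL_MAT_ID path used by mixture-of-experts models: the ids tensor holds expert indices, the active path copies a single index back to the host, and then an ordinary mat-mul runs on the chosen expert weights (dst->src[a_id + 2]). For reference, a minimal host-side sketch of that selection logic (illustrative names only — Matrix and pick_expert are not part of ggml):

#include <cassert>
#include <cstdint>
#include <vector>

struct Matrix { int64_t rows, cols; const float * data; };

// Mirrors ggml_cuda_mul_mat_id: ids[id] names the expert, and the matching
// weight matrix is then used for a regular mat-mul.
const Matrix & pick_expert(const std::vector<Matrix> & experts, const int32_t * ids, int id) {
    const int32_t a_id = ids[id];
    assert(a_id >= 0 && a_id < (int32_t) experts.size());
    return experts[a_id];
}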
 static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
 }
@@ -7770,14 +8310,17 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     char * src1_ddc = (char *) src1_extra->data_device[g_main_device];

     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
+        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
+        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
+        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
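The three new branches above quantize F32 data on the fly during GGML_OP_CPY, which is what allows a Q8_0/Q4_0/Q4_1 KV cache. As a reference for the Q8_0 layout (blocks of 32 floats, one scale plus 32 signed bytes), a minimal CPU-side sketch of the same arithmetic — the ggml_cpy_f32_q8_0_cuda kernel is a parallel version of this, and the real block stores the scale as fp16 rather than the float used here for brevity:

#include <cmath>
#include <cstdint>

#define QK8_0 32

// One Q8_0 block: d = amax/127, qs[i] = round(x[i]/d).
struct block_q8_0_sketch {
    float  d;            // the real block_q8_0 stores this as ggml_fp16_t
    int8_t qs[QK8_0];
};

static void quantize_block_q8_0_sketch(const float * x, block_q8_0_sketch * y) {
    float amax = 0.0f;
    for (int i = 0; i < QK8_0; i++) {
        amax = std::fmax(amax, std::fabs(x[i]));
    }
    const float d  = amax / 127.0f;
    const float id = d != 0.0f ? 1.0f / d : 0.0f;
    y->d = d;
    for (int i = 0; i < QK8_0; i++) {
        y->qs[i] = (int8_t) std::roundf(x[i] * id);
    }
}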
@@ -7788,6 +8331,7 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
 }

 static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    // TODO: why do we pass dst as src1 here?
     ggml_cuda_cpy(src0, dst, nullptr);
     (void) src1;
 }
@@ -7813,6 +8357,16 @@ static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
 }

+static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
+}
+
+static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
+}
+
 static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
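ggml_cuda_op_argsort implements GGML_OP_ARGSORT row by row: for every row of src0 it outputs the permutation of column indices that sorts that row (ascending or descending, per the op's order parameter). A semantics-only sketch in plain C++ — the CUDA kernel sorts each row in parallel rather than calling a serial sort:

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// out[i] is the index of the i-th smallest (or largest) element of the row.
std::vector<int32_t> argsort_row(const float * row, int n, bool ascending) {
    std::vector<int32_t> idx(n);
    std::iota(idx.begin(), idx.end(), 0);
    std::sort(idx.begin(), idx.end(), [&](int32_t a, int32_t b) {
        return ascending ? row[a] < row[b] : row[a] > row[b];
    });
    return idx;
}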
@@ -8068,8 +8622,9 @@ void ggml_cuda_set_main_device(const int main_device) {
                 main_device, g_device_count, g_main_device);
         return;
     }
-    g_main_device = main_device;
-    if (g_device_count > 1) {
+
+    if (g_main_device != main_device && g_device_count > 1) {
+        g_main_device = main_device;
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
         fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
@@ -8095,7 +8650,7 @@ void ggml_cuda_free_scratch() {
 }

 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    if (!g_cublas_loaded)
+    if (!g_cublas_loaded) return false;

     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8131,6 +8686,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_MUL:
             func = ggml_cuda_mul;
             break;
+        case GGML_OP_DIV:
+            func = ggml_cuda_div;
+            break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(tensor)) {
                 case GGML_UNARY_OP_GELU:
@@ -8144,7 +8702,8 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
                     break;
                 default:
                     return false;
-            }
+            }
+            break;
         case GGML_OP_NORM:
             func = ggml_cuda_norm;
             break;
@@ -8157,6 +8716,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_mul_mat;
             break;
+        case GGML_OP_MUL_MAT_ID:
+            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
+                return false;
+            }
+            func = ggml_cuda_mul_mat_id;
+            break;
         case GGML_OP_SCALE:
             func = ggml_cuda_scale;
             break;
@@ -8196,6 +8761,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_IM2COL:
             func = ggml_cuda_im2col;
             break;
+        case GGML_OP_SUM_ROWS:
+            func = ggml_cuda_sum_rows;
+            break;
+        case GGML_OP_ARGSORT:
+            func = ggml_cuda_argsort;
+            break;
         default:
             return false;
     }
@@ -8212,7 +8783,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_

 int ggml_cuda_get_device_count() {
     int device_count;
-    CUDA_CHECK(cudaGetDeviceCount(&device_count));
+    if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
+        return 0;
+    }
     return device_count;
 }

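ggml_cuda_get_device_count now reports zero devices when cudaGetDeviceCount fails, instead of reading an uninitialized count. A caller can treat zero as "no usable GPU" — a hypothetical fallback sketch:

#include <cstdio>
#include "ggml-cuda.h"

int main() {
    const int n_gpu = ggml_cuda_get_device_count(); // 0 on CUDA errors after this change
    if (n_gpu == 0) {
        fprintf(stderr, "no usable CUDA device, staying on CPU\n");
    }
    return 0;
}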
@@ -8228,27 +8801,16 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des

 #define UNUSED GGML_UNUSED

-struct ggml_backend_context_cuda {
-};
-
-static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
-    return GGML_CUDA_NAME;
-
-    UNUSED(backend);
-}
-
-static void ggml_backend_cuda_free(ggml_backend_t backend) {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-    delete cuda_ctx;
-    delete backend;
-}
+// cuda buffer

 struct ggml_backend_buffer_context_cuda {
-    void * device;
-
+    int device;
+    void * dev_ptr = nullptr;
     ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
     size_t temp_tensor_extra_index = 0;

+    ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
+
     ~ggml_backend_buffer_context_cuda() {
         delete[] temp_tensor_extras;
     }
@@ -8269,41 +8831,20 @@ struct ggml_backend_buffer_context_cuda {

 static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-    CUDA_CHECK(cudaFree(ctx->device));
+    CUDA_CHECK(cudaFree(ctx->dev_ptr));
     delete ctx;
 }

 static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-    return ctx->device;
-}
-
-static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    int64_t row_low = 0;
-    int64_t row_high = ggml_nrows(tensor);
-    int64_t nrows_split = row_high - row_low;
-
-    size_t size = ggml_nbytes_split(tensor, nrows_split);
-
-    int64_t ne0 = tensor->ne[0];
-
-    if (ggml_is_quantized(tensor->type)) {
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
-                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
-        }
-    }
-
-    return size;
-
-    UNUSED(buffer);
+    return ctx->dev_ptr;
 }

 static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;

     if (tensor->view_src != NULL && tensor->view_offs == 0) {
-        assert(tensor->view_src->buffer->backend == buffer->backend);
+        assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
         tensor->backend = tensor->view_src->backend;
         tensor->extra = tensor->view_src->extra;
         return;
@@ -8311,7 +8852,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g

     ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();

-    extra->data_device[g_main_device] = tensor->data;
+    extra->data_device[ctx->device] = tensor->data;

     tensor->backend = GGML_BACKEND_GPU;
     tensor->extra = extra;
@@ -8323,64 +8864,208 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
     int64_t nrows_split = row_high - row_low;

     size_t original_size = ggml_nbytes_split(tensor, nrows_split);
-    size_t padded_size =
+    size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);

     if (padded_size > original_size && tensor->view_src == nullptr) {
-        CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
+        CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
     }
 }

 UNUSED(buffer);
 }

+static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
+
+    UNUSED(buffer);
+}
+
+static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
+
+    UNUSED(buffer);
+}
+
 static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
-    /* .free_buffer
-    /* .get_base
-    /* .
-    /* .
-    /* .
+    /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .set_tensor      = */ ggml_backend_cuda_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
+    /* .cpy_tensor_from = */ NULL,
+    /* .cpy_tensor_to   = */ NULL,
 };

-
-
+// cuda buffer type
+
+static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    int device = (int) (intptr_t) buft->context;

-
+    ggml_cuda_set_device(device);

     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0

-
-    CUDA_CHECK(cudaMalloc(&
+    void * dev_ptr;
+    CUDA_CHECK(cudaMalloc(&dev_ptr, size));
+
+    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);

-    return ggml_backend_buffer_init(
+    return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
 }

-static size_t
+static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 128;
+
+    UNUSED(buft);
+}
+
+static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
+    int64_t row_low = 0;
+    int64_t row_high = ggml_nrows(tensor);
+    int64_t nrows_split = row_high - row_low;
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+    int64_t ne0 = tensor->ne[0];
+
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }
+    }
+
+    return size;
+
+    UNUSED(buft);
+}
+
+static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    return ggml_backend_is_cuda(backend);
+
+    UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
+    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
+    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
+    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
+    static bool ggml_backend_buffer_type_cuda_initialized = false;
+    if (!ggml_backend_buffer_type_cuda_initialized) {
+        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
+            ggml_backend_buffer_type_cuda[i] = {
+                /* .iface   = */ cuda_backend_buffer_type_interface,
+                /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
+            };
+        }
+        ggml_backend_buffer_type_cuda_initialized = true;
+    }
+
+    return &ggml_backend_buffer_type_cuda[device];
+}
+
|
+
// host buffer type
|
8981
|
+
|
8982
|
+
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
8983
|
+
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
8984
|
+
CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
|
8985
|
+
delete ctx;
|
8986
|
+
}
|
8987
|
+
|
8988
|
+
static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
8989
|
+
void * ptr;
|
8990
|
+
CUDA_CHECK(cudaMallocHost(&ptr, size));
|
8991
|
+
|
8992
|
+
// FIXME: this is a hack to avoid having to implement a new buffer type
|
8993
|
+
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
8994
|
+
buffer->buft = buft;
|
8995
|
+
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
|
8996
|
+
|
8997
|
+
return buffer;
|
8998
|
+
|
8999
|
+
UNUSED(buft);
|
9000
|
+
}
|
9001
|
+
|
9002
|
+
struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
|
9003
|
+
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
|
9004
|
+
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
9005
|
+
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
9006
|
+
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
9007
|
+
};
|
9008
|
+
|
9009
|
+
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
9010
|
+
static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
|
9011
|
+
/* .iface = */ cuda_backend_host_buffer_type_interface,
|
9012
|
+
/* .context = */ nullptr,
|
9013
|
+
};
|
9014
|
+
|
9015
|
+
return &ggml_backend_buffer_type_cuda_host;
|
9016
|
+
}
|
9017
|
+
|
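The host buffer type hands out pinned (cudaMallocHost) memory, which lets async cudaMemcpy to the GPU skip the staging copy through pageable host memory; as the FIXME notes, it piggybacks on the CPU buffer implementation and only swaps in a CUDA-aware free. A sketch of how it would be used, under the same ggml-backend.h assumptions as above:

#include "ggml-backend.h"
#include "ggml-cuda.h"

void demo_pinned_host_buffer() {
    ggml_backend_buffer_type_t buft = ggml_backend_cuda_host_buffer_type();
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, (size_t) 1024*1024);
    ggml_backend_buffer_free(buf); // calls cudaFreeHost through the patched iface
}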
+// backend
+
+struct ggml_backend_context_cuda {
+    int device;
+};
+
+static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+    return GGML_CUDA_NAME;
+
     UNUSED(backend);
 }

+static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    delete cuda_ctx;
+    delete backend;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
+}
+
 static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

-    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
-
-    UNUSED(backend);
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
 }

 static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

-    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-
-    UNUSED(backend);
+    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
 }

 static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
-    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));

     UNUSED(backend);
 }
@@ -8394,14 +9079,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
     UNUSED(cgraph);
 }

-
+static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");

     UNUSED(backend);
     UNUSED(plan);
 }

-
+static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");

     UNUSED(backend);
@@ -8409,7 +9094,9 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
 }

 static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    ggml_cuda_set_main_device(cuda_ctx->device);

     ggml_compute_params params = {};
     params.type = GGML_TASK_COMPUTE;
@@ -8417,13 +9104,18 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];

-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
             continue;
-
+
         assert(node->backend == GGML_BACKEND_GPU);
+        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+        assert(node->extra != nullptr);
+
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
                 assert(node->src[j]->backend == GGML_BACKEND_GPU);
+                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                assert(node->src[j]->extra != nullptr);
             }
         }

@@ -8460,27 +9152,98 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     UNUSED(backend);
 }

+static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_RELU:
+                    return true;
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
+            {
+                struct ggml_tensor * a;
+                struct ggml_tensor * b;
+                if (op->op == GGML_OP_MUL_MAT) {
+                    a = op->src[0];
+                    b = op->src[1];
+                } else {
+                    a = op->src[2];
+                    b = op->src[1];
+                }
+                if (a->ne[3] != b->ne[3]) {
+                    return false;
+                }
+                return true;
+            } break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_NORM:
+        case GGML_OP_REPEAT:
+        case GGML_OP_GET_ROWS:
+        case GGML_OP_DUP:
+        case GGML_OP_ADD:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_SCALE:
+        case GGML_OP_SQR:
+        case GGML_OP_CLAMP:
+        case GGML_OP_CPY:
+        case GGML_OP_CONT:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_ROPE:
+        case GGML_OP_ALIBI:
+        case GGML_OP_IM2COL:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_ARGSORT:
+            return true;
+        default:
+            return false;
+    }
+
+    UNUSED(backend);
+}
+
9217
|
static ggml_backend_i cuda_backend_i = {
|
8464
|
-
/* .get_name
|
8465
|
-
/* .free
|
8466
|
-
/* .
|
8467
|
-
/* .
|
8468
|
-
/* .
|
8469
|
-
/* .
|
8470
|
-
/* .
|
8471
|
-
/* .
|
8472
|
-
/* .
|
8473
|
-
/* .
|
8474
|
-
/* .
|
8475
|
-
/* .
|
8476
|
-
/* .
|
8477
|
-
/* .supports_op = */ nullptr,
|
9218
|
+
/* .get_name = */ ggml_backend_cuda_name,
|
9219
|
+
/* .free = */ ggml_backend_cuda_free,
|
9220
|
+
/* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
|
9221
|
+
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
|
9222
|
+
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
|
9223
|
+
/* .cpy_tensor_from_async = */ NULL,
|
9224
|
+
/* .cpy_tensor_to_async = */ NULL,
|
9225
|
+
/* .synchronize = */ ggml_backend_cuda_synchronize,
|
9226
|
+
/* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
|
9227
|
+
/* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
|
9228
|
+
/* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
|
9229
|
+
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
9230
|
+
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
8478
9231
|
};
|
8479
9232
|
|
8480
|
-
ggml_backend_t ggml_backend_cuda_init() {
|
9233
|
+
ggml_backend_t ggml_backend_cuda_init(int device) {
|
8481
9234
|
ggml_init_cublas(); // TODO: remove from ggml.c
|
8482
9235
|
|
8483
|
-
|
9236
|
+
if (device < 0 || device >= ggml_cuda_get_device_count()) {
|
9237
|
+
fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
|
9238
|
+
return nullptr;
|
9239
|
+
}
|
9240
|
+
|
9241
|
+
// not strictly necessary, but it may reduce the overhead of the first graph_compute
|
9242
|
+
ggml_cuda_set_main_device(device);
|
9243
|
+
|
9244
|
+
ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
|
9245
|
+
/* .device = */ device
|
9246
|
+
};
|
8484
9247
|
|
8485
9248
|
ggml_backend_t cuda_backend = new ggml_backend {
|
8486
9249
|
/* .interface = */ cuda_backend_i,
|
@@ -8489,3 +9252,25 @@ ggml_backend_t ggml_backend_cuda_init() {
|
|
8489
9252
|
|
8490
9253
|
return cuda_backend;
|
8491
9254
|
}
|
9255
|
+
|
9256
|
+
bool ggml_backend_is_cuda(ggml_backend_t backend) {
|
9257
|
+
return backend->iface.get_name == ggml_backend_cuda_name;
|
9258
|
+
}
|
9259
|
+
|
9260
|
+
static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
|
9261
|
+
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
|
9262
|
+
return cuda_backend;
|
9263
|
+
|
9264
|
+
UNUSED(params);
|
9265
|
+
}
|
9266
|
+
|
9267
|
+
extern "C" int ggml_backend_cuda_reg_devices() {
|
9268
|
+
int device_count = ggml_cuda_get_device_count();
|
9269
|
+
//int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
|
9270
|
+
for (int i = 0; i < device_count; i++) {
|
9271
|
+
char name[128];
|
9272
|
+
snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
|
9273
|
+
ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
|
9274
|
+
}
|
9275
|
+
return device_count;
|
9276
|
+
}
|
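ggml_backend_cuda_init now takes a device index (one backend instance per GPU) and returns nullptr on an invalid index instead of aborting, while ggml_backend_cuda_reg_devices registers each device with the backend registry under names like "CUDA0", "CUDA1", and so on. A minimal initialization sketch using only functions shown in this diff plus ggml_backend_free:

#include <cstdio>
#include "ggml-backend.h"
#include "ggml-cuda.h"

int main() {
    // One backend instance per device; invalid indices return nullptr.
    ggml_backend_t backend = ggml_backend_cuda_init(/* device = */ 0);
    if (backend == nullptr) {
        fprintf(stderr, "CUDA backend unavailable\n");
        return 1;
    }
    ggml_backend_free(backend);
    return 0;
}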