llama_cpp 0.9.5 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +121 -15
- data/ext/llama_cpp/src/ggml-alloc.c +42 -7
- data/ext/llama_cpp/src/ggml-alloc.h +7 -0
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1140 -355
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +506 -158
- data/ext/llama_cpp/src/ggml-metal.metal +795 -144
- data/ext/llama_cpp/src/ggml.c +331 -111
- data/ext/llama_cpp/src/ggml.h +49 -4
- data/ext/llama_cpp/src/llama.cpp +749 -329
- data/ext/llama_cpp/src/llama.h +28 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +20 -2
- metadata +2 -2
@@ -1,7 +1,8 @@
 #include <algorithm>
-#include <cinttypes>
 #include <cstddef>
 #include <cstdint>
+#include <cinttypes>
+#include <float.h>
 #include <limits>
 #include <stdint.h>
 #include <stdio.h>
@@ -69,6 +70,7 @@
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamFireAndForget hipStreamFireAndForget
 #define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
@@ -190,7 +192,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
             fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
                 cudaGetErrorString(err_)); \
             fprintf(stderr, "current device: %d\n", id); \
-
+            GGML_ASSERT(!"CUDA error"); \
         } \
     } while (0)
 
@@ -204,7 +206,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
             fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
                 err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
             fprintf(stderr, "current device: %d\n", id); \
-
+            GGML_ASSERT(!"cuBLAS error"); \
         } \
     } while (0)
 #else
@@ -216,7 +218,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
             cudaGetDevice(&id); \
             fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
             fprintf(stderr, "current device: %d\n", id); \
-
+            GGML_ASSERT(!"cuBLAS error"); \
         } \
     } while (0)
 #endif // CUDART_VERSION >= 11
@@ -433,8 +435,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define WARP_SIZE 32
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
-#define CUDA_ADD_BLOCK_SIZE 256
-#define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_RELU_BLOCK_SIZE 256
@@ -527,40 +527,87 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
     return x;
 }
 
-static
-
+static __device__ __forceinline__ float op_repeat(const float a, const float b) {
+    return b;
+}
 
-
-
-}
-    dst[i] = x[i] + y[i%ky];
+static __device__ __forceinline__ float op_add(const float a, const float b) {
+    return a + b;
 }
 
-static
-
+static __device__ __forceinline__ float op_mul(const float a, const float b) {
+    return a * b;
+}
 
-
-
-}
-    dst[i] = __hadd(x[i], __float2half(y[i]));
+static __device__ __forceinline__ float op_div(const float a, const float b) {
+    return a / b;
 }
 
-
-
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1, int s2, int s3,
+        /*int s10,*/ int s11, int s12, int s13) {
+    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
+    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
+    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
 
-    if (
+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
         return;
     }
-
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
+        const int i10 = i0 % ne10;
+        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+    }
 }
 
-
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1, int s2, int s3,
+        /*int s10,*/ int s11, int s12, int s13) {
+
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
-
+    const int i3 = i/(ne2*ne1*ne0);
+    const int i2 = (i/(ne1*ne0)) % ne2;
+    const int i1 = (i/ne0) % ne1;
+    const int i0 = i % ne0;
+
+    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
         return;
     }
-
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    const int i10 = i0 % ne10;
+    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
 }
 
 static __global__ void gelu_f32(const float * x, float * dst, const int k) {
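
As an illustrative aside (not part of the gem's sources): the `k_bin_bcast` kernels above implement ggml-style broadcasting by reducing each `src1` coordinate modulo its extent, so a `src1` dimension of size 1 is repeated along the corresponding `dst` dimension. A minimal CPU sketch of the same index rule, with made-up example shapes:

```cpp
#include <cstdio>

// CPU sketch of the broadcast rule used by k_bin_bcast: every dst/src0 index
// (i0,i1,i2) reads src1 at (i0 % ne10, i1 % ne11, i2 % ne12), so a src1 extent
// of 1 in a dimension is broadcast along that dimension.
int main() {
    const int ne[3]  = {4, 3, 2}; // dst / src0 extents (example values)
    const int ne1[3] = {4, 1, 2}; // src1 extents; dimension 1 is broadcast

    float src0[4*3*2], src1[4*1*2], dst[4*3*2];
    for (int i = 0; i < 4*3*2; ++i) src0[i] = (float) i;
    for (int i = 0; i < 4*1*2; ++i) src1[i] = 100.0f + i;

    for (int i2 = 0; i2 < ne[2]; ++i2) {
        for (int i1 = 0; i1 < ne[1]; ++i1) {
            for (int i0 = 0; i0 < ne[0]; ++i0) {
                const int i10 = i0 % ne1[0];
                const int i11 = i1 % ne1[1];
                const int i12 = i2 % ne1[2];
                const int idst  = (i2*ne[1]  + i1 )*ne[0]  + i0;
                const int isrc1 = (i12*ne1[1] + i11)*ne1[0] + i10;
                dst[idst] = src0[idst] + src1[isrc1]; // op_add
            }
        }
    }
    printf("dst[0] = %g, dst[%d] = %g\n", dst[0], 4*3*2 - 1, dst[4*3*2 - 1]);
    return 0;
}
```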
@@ -604,12 +651,10 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
 }
 
 template <int block_size>
-static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
-    const float eps = 1e-5f;
-
     float2 mean_var = make_float2(0.f, 0.f);
 
     for (int col = tid; col < ncols; col += block_size) {
@@ -4559,6 +4604,116 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
     cpy_1(cx + x_offset, cdst + dst_offset);
 }
 
+static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q8_0 * dsti = (block_q8_0 *) cdsti;
+
+    float amax = 0.0f; // absolute max
+
+    for (int j = 0; j < QK8_0; j++) {
+        const float v = xi[j];
+        amax = fmaxf(amax, fabsf(v));
+    }
+
+    const float d = amax / ((1 << 7) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK8_0; ++j) {
+        const float x0 = xi[j]*id;
+
+        dsti->qs[j] = roundf(x0);
+    }
+}
+
+static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_0 * dsti = (block_q4_0 *) cdsti;
+
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_0; ++j) {
+        const float v = xi[j];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            vmax = v;
+        }
+    }
+
+    const float d = vmax / -8;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK4_0/2; ++j) {
+        const float x0 = xi[0 + j]*id;
+        const float x1 = xi[QK4_0/2 + j]*id;
+
+        const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
+        const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
+
+        dsti->qs[j] = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_1 * dsti = (block_q4_1 *) cdsti;
+
+    float vmin = FLT_MAX;
+    float vmax = -FLT_MAX;
+
+    for (int j = 0; j < QK4_1; ++j) {
+        const float v = xi[j];
+
+        if (v < vmin) vmin = v;
+        if (v > vmax) vmax = v;
+    }
+
+    const float d = (vmax - vmin) / ((1 << 4) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->dm.x = d;
+    dsti->dm.y = vmin;
+
+    for (int j = 0; j < QK4_1/2; ++j) {
+        const float x0 = (xi[0 + j] - vmin)*id;
+        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
+
+        const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
+        const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
+
+        dsti->qs[j] = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+template <cpy_kernel_t cpy_blck, int qk>
+static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
+        const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+        const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
+    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int i02 = i / (ne00*ne01);
+    const int i01 = (i - i02*ne01*ne00) / ne00;
+    const int i00 = (i - i02*ne01*ne00 - i01*ne00);
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+    const int i12 = i / (ne10*ne11);
+    const int i11 = (i - i12*ne10*ne11) / ne10;
+    const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+    cpy_blck(cx + x_offset, cdst + dst_offset);
+}
+
 static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
     const float y = (i0 / 2 - low) / max(0.001f, high - low);
     return 1.0f - min(1.0f, max(0.0f, y));
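
As a reference sketch (illustrative only, not part of the diff): `cpy_blck_f32_q8_0` above stores one scale `d = amax/127` per block of 32 floats plus 32 signed bytes, and dequantization is simply `q[j]*d`. A host-side round trip of the same scheme, assuming a block size of 32 and a plain float scale for simplicity:

```cpp
#include <cmath>
#include <cstdio>

// Host-side sketch of the Q8_0 round trip mirrored from cpy_blck_f32_q8_0:
// scale d = amax/127, quantize q = round(x/d), dequantize x' = q*d.
static void q8_0_roundtrip(const float * x, float * out, int qk /* block size, e.g. 32 */) {
    float amax = 0.0f;
    for (int j = 0; j < qk; ++j) {
        amax = fmaxf(amax, fabsf(x[j]));
    }
    const float d  = amax / 127.0f;   // ((1 << 7) - 1)
    const float id = d ? 1.0f/d : 0.0f;

    for (int j = 0; j < qk; ++j) {
        const signed char q = (signed char) roundf(x[j]*id); // byte that would be stored
        out[j] = q*d;                                        // dequantized value
    }
}

int main() {
    float x[32], y[32];
    for (int j = 0; j < 32; ++j) x[j] = sinf(0.1f*j);
    q8_0_roundtrip(x, y, 32);
    printf("x[5] = %f  ~  y[5] = %f\n", x[5], y[5]);
    return 0;
}
```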
@@ -4713,6 +4868,65 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols,
     dst[i] = col * m_k + x[i];
 }
 
+static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.y;
+    const int col = threadIdx.x;
+
+    float sum = 0.0f;
+    for (int i = col; i < ncols; i += blockDim.x) {
+        sum += x[row * ncols + i];
+    }
+
+    sum = warp_reduce_sum(sum);
+
+    if (col == 0) {
+        dst[row] = sum;
+    }
+}
+
+template<typename T>
+static inline __device__ void swap(T & a, T & b) {
+    T tmp = a;
+    a = b;
+    b = tmp;
+}
+
+template<ggml_sort_order order>
+static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
+    // bitonic sort
+    int col = threadIdx.x;
+    int row = blockIdx.y;
+
+    if (col >= ncols) return;
+
+    const float * x_row = x + row * ncols;
+    int * dst_row = dst + row * ncols;
+
+    // initialize indices
+    if (col < ncols) {
+        dst_row[col] = col;
+    }
+    __syncthreads();
+
+    for (int k = 2; k <= ncols; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
+                        swap(dst_row[col], dst_row[ixj]);
+                    }
+                } else {
+                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
+                        swap(dst_row[col], dst_row[ixj]);
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+}
+
 static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
     const int col = blockDim.y*blockIdx.y + threadIdx.y;
     const int row = blockDim.x*blockIdx.x + threadIdx.x;
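
For orientation (an illustrative aside, not code from the gem): `k_argsort_f32_i32` above is a standard bitonic sorting network, which is why the launcher `argsort_f32_i32_cuda` later in this diff asserts that `ncols` is a power of two. A serial C++ sketch of the same compare-exchange pattern over an index array, ascending order only:

```cpp
#include <cstdio>
#include <utility>
#include <vector>

// Serial sketch of the bitonic compare-exchange network used by
// k_argsort_f32_i32: sorts indices by value; n must be a power of two.
static std::vector<int> argsort_bitonic(const std::vector<float> & x) {
    const int n = (int) x.size();
    std::vector<int> idx(n);
    for (int i = 0; i < n; ++i) idx[i] = i;

    for (int k = 2; k <= n; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            for (int col = 0; col < n; ++col) {     // each CUDA thread handles one col
                const int ixj = col ^ j;
                if (ixj > col) {
                    const bool asc = (col & k) == 0; // direction of this sub-sequence
                    if (asc ? x[idx[col]] > x[idx[ixj]]
                            : x[idx[col]] < x[idx[ixj]]) {
                        std::swap(idx[col], idx[ixj]);
                    }
                }
            }
        }
    }
    return idx;
}

int main() {
    const std::vector<float> x = {0.3f, -1.0f, 2.5f, 0.0f};
    for (int i : argsort_bitonic(x)) printf("%d ", i); // prints: 1 3 0 2
    printf("\n");
    return 0;
}
```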
@@ -4722,8 +4936,9 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
     }
 
     const int i = row*ncols + col;
-    //
-    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
+    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
 }
 
 static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
@@ -4845,25 +5060,119 @@ static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const
     k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
 }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+template<float (*bin_op)(const float, const float)>
+struct bin_bcast_cuda {
+    template<typename src0_t, typename src1_t, typename dst_t>
+    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
+            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
+            cudaStream_t stream) {
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+
+        int nr0 = ne10/ne0;
+        int nr1 = ne11/ne1;
+        int nr2 = ne12/ne2;
+        int nr3 = ne13/ne3;
+
+        int nr[4] = { nr0, nr1, nr2, nr3 };
+
+        // collapse dimensions until first broadcast dimension
+        int64_t cne0[] = {ne0, ne1, ne2, ne3};
+        int64_t cne1[] = {ne10, ne11, ne12, ne13};
+        size_t cnb0[] = {nb0, nb1, nb2, nb3};
+        size_t cnb1[] = {nb10, nb11, nb12, nb13};
+        auto collapse = [](int64_t cne[]) {
+            cne[0] *= cne[1];
+            cne[1] = cne[2];
+            cne[2] = cne[3];
+            cne[3] = 1;
+        };
+
+        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
+            cnb[1] *= cne[1];
+            cnb[2] *= cne[2];
+            cnb[3] *= cne[3];
+        };
+
+        for (int i = 0; i < 4; i++) {
+            if (nr[i] != 1) {
+                break;
+            }
+            if (i > 0) {
+                collapse_nb(cnb0, cne0);
+                collapse_nb(cnb1, cne1);
+                collapse(cne0);
+                collapse(cne1);
+            }
+        }
+        {
+            int64_t ne0 = cne0[0];
+            int64_t ne1 = cne0[1];
+            int64_t ne2 = cne0[2];
+            int64_t ne3 = cne0[3];
+
+            int64_t ne10 = cne1[0];
+            int64_t ne11 = cne1[1];
+            int64_t ne12 = cne1[2];
+            int64_t ne13 = cne1[3];
+
+            //size_t nb0 = cnb0[0];
+            size_t nb1 = cnb0[1];
+            size_t nb2 = cnb0[2];
+            size_t nb3 = cnb0[3];
+
+            //size_t nb10 = cnb1[0];
+            size_t nb11 = cnb1[1];
+            size_t nb12 = cnb1[2];
+            size_t nb13 = cnb1[3];
+
+            //size_t s0 = nb0 / sizeof(src1_t);
+            size_t s1 = nb1 / sizeof(src1_t);
+            size_t s2 = nb2 / sizeof(src1_t);
+            size_t s3 = nb3 / sizeof(src1_t);
+
+            //size_t s10 = nb10 / sizeof(src1_t);
+            size_t s11 = nb11 / sizeof(src1_t);
+            size_t s12 = nb12 / sizeof(src1_t);
+            size_t s13 = nb13 / sizeof(src1_t);
+
+
+            const int block_size = 128;
+
+            int64_t hne0 = std::max(ne0/2LL, 1LL);
+
+            dim3 block_dims;
+            block_dims.x = std::min<unsigned int>(hne0, block_size);
+            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
+            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
+
+            dim3 block_nums(
+                (hne0 + block_dims.x - 1) / block_dims.x,
+                (ne1 + block_dims.y - 1) / block_dims.y,
+                (ne2*ne3 + block_dims.z - 1) / block_dims.z
+            );
 
-
-
-
-
+            if (block_nums.z > 65535) {
+                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
+                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
+                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd,
+                    ne0, ne1, ne2, ne3,
+                    ne10, ne11, ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s10, */ s11, s12, s13);
+            } else {
+                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd,
+                    ne0, ne1, ne2, ne3,
+                    ne10, ne11, ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s10, */ s11, s12, s13);
+            }
+        }
+    }
+};
 
 static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
@@ -4885,14 +5194,14 @@ static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
     sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
-static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
         const dim3 block_dims(WARP_SIZE, 1, 1);
-        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
     } else {
         const dim3 block_dims(1024, 1, 1);
-        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
     }
 }
 
@@ -4914,34 +5223,10 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
     quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
 }
 
-template<typename dst_t>
-static void
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-template<typename dst_t>
-static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-template<typename dst_t>
-static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-template<typename dst_t>
-static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-template<typename dst_t>
-static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<
+    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
 
 template<typename dst_t>
@@ -4990,6 +5275,64 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
 #endif
 }
 
+static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_cuda;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_cuda;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_cuda;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_cuda;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_cuda;
+        case GGML_TYPE_F32:
+            return dequantize_block_cuda<1, 1, convert_f32>;
+        default:
+            return nullptr;
+    }
+}
+
+static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_cuda;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_cuda;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_cuda;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_cuda;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_cuda;
+        case GGML_TYPE_F16:
+            return dequantize_block_cuda<1, 1, convert_f16>;
+        default:
+            return nullptr;
+    }
+}
+
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -5078,6 +5421,15 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
+static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, 1, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    dequantize_mul_mat_vec<1, 1, convert_f16>
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -5168,83 +5520,6 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
-static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<1, 1, convert_f16>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return dequantize_row_q4_0_cuda;
-        case GGML_TYPE_Q4_1:
-            return dequantize_row_q4_1_cuda;
-        case GGML_TYPE_Q5_0:
-            return dequantize_row_q5_0_cuda;
-        case GGML_TYPE_Q5_1:
-            return dequantize_row_q5_1_cuda;
-        case GGML_TYPE_Q8_0:
-            return dequantize_row_q8_0_cuda;
-        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
-        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
-        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
-        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
-        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
-        case GGML_TYPE_F32:
-            return convert_fp32_to_fp16_cuda;
-        default:
-            return nullptr;
-    }
-}
-
-static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return dequantize_row_q4_0_cuda;
-        case GGML_TYPE_Q4_1:
-            return dequantize_row_q4_1_cuda;
-        case GGML_TYPE_Q5_0:
-            return dequantize_row_q5_0_cuda;
-        case GGML_TYPE_Q5_1:
-            return dequantize_row_q5_1_cuda;
-        case GGML_TYPE_Q8_0:
-            return dequantize_row_q8_0_cuda;
-        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
-        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
-        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
-        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
-        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
-        case GGML_TYPE_F16:
-            return convert_fp16_to_fp32_cuda;
-        default:
-            return nullptr;
-    }
-}
-
 static void ggml_mul_mat_q4_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
@@ -5737,19 +6012,52 @@ static void ggml_cpy_f32_f16_cuda(
         (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
 }
 
-static void
+static void ggml_cpy_f32_q8_0_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
     const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
 
-
-
+    GGML_ASSERT(ne % QK8_0 == 0);
+    const int num_blocks = ne / QK8_0;
+    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
         (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
 }
 
-static void
-    const
-
+static void ggml_cpy_f32_q4_0_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK4_0 == 0);
+    const int num_blocks = ne / QK4_0;
+    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void ggml_cpy_f32_q4_1_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK4_1 == 0);
+    const int num_blocks = ne / QK4_1;
+    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void ggml_cpy_f16_f16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }
 
 static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
@@ -5823,6 +6131,27 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const
     alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
 }
 
+static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const dim3 block_nums(1, nrows, 1);
+    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
+static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
+    // bitonic sort requires ncols to be power of 2
+    GGML_ASSERT((ncols & (ncols - 1)) == 0);
+
+    const dim3 block_dims(ncols, 1, 1);
+    const dim3 block_nums(1, nrows, 1);
+    if (order == GGML_SORT_ASC) {
+        k_argsort_f32_i32<GGML_SORT_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    } else if (order == GGML_SORT_DESC) {
+        k_argsort_f32_i32<GGML_SORT_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    } else {
+        GGML_ASSERT(false);
+    }
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
     const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -5915,7 +6244,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         return ptr;
     }
 #ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u
+    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
             (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
 #endif
     void * ptr;
@@ -6053,7 +6382,7 @@ void * ggml_cuda_host_malloc(size_t size) {
         // The allocation error can be bypassed. A null ptr will assigned out of this function.
         // This can fixed the OOM error in WSL.
        cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f
+        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
            size/1024.0/1024.0, cudaGetErrorString(err));
        return nullptr;
    }
@@ -6098,75 +6427,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
         return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
-    }
-    if (nb0 == ts) {
+    } else if (nb0 == ts) {
         return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
-    }
-
-
-
-
-
-
-    }
-    return cudaSuccess;
-}
-
-static void ggml_cuda_op_repeat(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
-    // guaranteed to be an integer due to the check in ggml_can_repeat
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const int nr0 = (int)(ne0/ne00);
-    const int nr1 = (int)(ne1/ne01);
-    const int nr2 = (int)(ne2/ne02);
-    const int nr3 = (int)(ne3/ne03);
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
-    for (int i3 = 0; i3 < nr3; i3++) {
-        for (int k3 = 0; k3 < ne03; k3++) {
-            for (int i2 = 0; i2 < nr2; i2++) {
-                for (int k2 = 0; k2 < ne02; k2++) {
-                    for (int i1 = 0; i1 < nr1; i1++) {
-                        for (int k1 = 0; k1 < ne01; k1++) {
-                            for (int i0 = 0; i0 < nr0; i0++) {
-                                CUDA_CHECK(cudaMemcpyAsync(
-                                    (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
-                                    (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
-                                    ne00*nb0, cudaMemcpyDeviceToDevice, stream));
-                            }
-                        }
-                    }
-                }
-            }
+    } else {
+        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+            const void * rx = (const void *) ((const char *) x + i1*nb1);
+            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+            // pretend the row is a matrix with cols=1
+            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+            if (r != cudaSuccess) return r;
         }
+        return cudaSuccess;
     }
-
-    (void) src1;
-    (void) src1_d;
 }
 
 static void ggml_cuda_op_get_rows(
@@ -6213,44 +6485,55 @@ static void ggml_cuda_op_get_rows(
     }
 }
 
-
+template<class op>
+inline void ggml_cuda_op_bin_bcast(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-
+        op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-
+        op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-
+        op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream);
     } else {
-        fprintf(stderr, "src0
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
         GGML_ASSERT(false);
     }
+}
+
+static void ggml_cuda_op_repeat(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
 
     (void) src1;
-    (void)
+    (void) src1_d;
 }
 
-inline void
+inline void ggml_cuda_op_add(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
-
-
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
 
-
-    const
+inline void ggml_cuda_op_mul(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
-
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
 
-
+inline void ggml_cuda_op_div(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
 }
 
 inline void ggml_cuda_op_gelu(
@@ -6319,7 +6602,10 @@ inline void ggml_cuda_op_norm(
     const int64_t ne00 = src0->ne[0];
     const int64_t nrows = ggml_nrows(src0);
 
-
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
 
     (void) src1;
     (void) dst;
@@ -6474,6 +6760,8 @@ inline void ggml_cuda_op_mul_mat_vec_q(
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, const cudaStream_t & stream) {
 
+    GGML_ASSERT(ggml_nrows(src1) == 1);
+
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
 
@@ -6533,7 +6821,8 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     size_t ash;
     dfloat * src1_dfloat = nullptr; // dfloat == half
 
-    bool src1_convert_f16 =
+    bool src1_convert_f16 =
+        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
         src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
         src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
 
@@ -6859,6 +7148,42 @@ inline void ggml_cuda_op_im2col(
     (void) src0_dd;
 }
 
+inline void ggml_cuda_op_sum_rows(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_argsort(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+    argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7067,7 +7392,7 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-
+    const int64_t nrows0 = ggml_nrows(src0);
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
@@ -7103,10 +7428,9 @@ static void ggml_cuda_op_mul_mat(
 
     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
-
     const bool src1_is_contiguous = ggml_is_contiguous(src1);
-
-
+
+    const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
 
     const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
     GGML_ASSERT(!(split && ne02 > 1));
@@ -7231,7 +7555,7 @@ static void ggml_cuda_op_mul_mat(
             const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
 
             // for split tensors the data begins at i0 == i0_offset_low
-            char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
+            char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
             float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
             char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
             float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
@@ -7376,6 +7700,10 @@ static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, gg
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
 }
 
+static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
+}
+
 static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
 }
@@ -7401,7 +7729,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded)
+    if (!g_cublas_loaded) return false;
 
     const int64_t ne10 = src1->ne[0];
 
@@ -7479,7 +7807,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
-__global__
+static __global__ void k_compute_batched_ptrs(
         const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
         const void ** ptrs_src, void ** ptrs_dst,
         int ne12, int ne13,
@@ -7535,9 +7863,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
-
-    CUDA_CHECK(cudaGetDevice(&id));
-    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
+    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -7594,7 +7920,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         // use cublasGemmStridedBatchedEx
         CUBLAS_CHECK(
-        cublasGemmStridedBatchedEx(g_cublas_handles[
+        cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
                 &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
                             (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
@@ -7628,7 +7954,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUDA_CHECK(cudaGetLastError());
 
         CUBLAS_CHECK(
-        cublasGemmBatchedEx(g_cublas_handles[
+        cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
                 &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
                             (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
@@ -7698,10 +8024,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 #ifdef GGML_CUDA_FORCE_DMMV
         const bool use_mul_mat_vec_q = false;
 #else
-        const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+        const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
#endif // GGML_CUDA_FORCE_DMMV
 
         if (use_mul_mat_vec_q) {
+            // NOTE: this kernel does not support ggml_nrows(src1) > 1
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
         } else {
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
@@ -7726,6 +8053,219 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     }
 }
 
+#if 0
+template<typename ... Srcs>
+static __global__ void k_compute_batched_ptrs_id(
+        const void ** ptrs_src, void ** ptrs_dst,
+        int ne12, int ne13,
+        int ne23,
+        int nb02, int nb03,
+        int nb12, int nb13,
+        int nb2, int nb3,
+        int r2, int r3,
+        ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
+        const half * src1_f16, half * dst_f16,
+        const int32_t * ids, const int id,
+        Srcs... src0s) {
+
+    int i = ids[id];
+
+    half * src0_f16;
+    const void * srcs_ar[] = { (const half *) src0s... };
+    if (src0_type == GGML_TYPE_F16) {
+        src0_f16 = (half *) srcs_ar[i];
+    } else {
+        src0_f16 = src0_as_f16;
+        if (threadIdx.x == 0 && threadIdx.y == 0) {
+            const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
+            to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
+        }
+    }
+
+    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i13 >= ne13 || i12 >= ne12) {
+        return;
+    }
+
+    int i03 = i13 / r3;
+    int i02 = i12 / r2;
+
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02 + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
+}
+
+static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
+    const struct ggml_tensor * ids = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * src00 = dst->src[2];
+
+    const int id = dst->op_params[0];
+
+    GGML_ASSERT(!ggml_is_transposed(src00));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+
+    GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
+    const int64_t ne01 = src00->ne[1];
+    const int64_t ne02 = src00->ne[2];
+    const int64_t ne03 = src00->ne[3];
+
+    //const int64_t nb01 = src00->nb[1];
+    const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
+    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    //const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
+    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+
+    const int64_t ne1 = ggml_nelements(src1);
+    const int64_t ne  = ggml_nelements(dst);
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
+
+    //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    //void * src0_ddq = src0_extra->data_device[g_main_device];
+    //half * src0_as_f16 = (half *) src0_ddq;
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    // convert src1 to fp16
+    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+    GGML_ASSERT(to_fp16_cuda != nullptr);
+
+    size_t src1_as = 0;
+    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
+    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
+
+    size_t dst_as = 0;
+    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    const half alpha_f16 = 1.0f;
+    const half beta_f16  = 0.0f;
+
+    // use cublasGemmBatchedEx
+    const int ne23 = ne12*ne13;
+
+    const void ** ptrs_src = nullptr;
+         void ** ptrs_dst = nullptr;
+
+    size_t ptrs_src_s = 0;
+    size_t ptrs_dst_s = 0;
+
+    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+    ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
+
+    int64_t src0_ne = ggml_nelements(src00);
+    half * src0_as_f16 = nullptr;
+    size_t src0_as = 0;
+    if (src00->type != GGML_TYPE_F16) {
+        src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
+    }
+
+    static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
+    dim3 block_dims(ne13, ne12);
+    k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
+            ptrs_src, ptrs_dst,
+            ne12, ne13,
+            ne23,
+            ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
+            nb12, nb13,
+            dst->nb[2], dst->nb[3],
+            r2, r3,
+            src00->type, src0_as_f16, src0_ne,
+            src1_as_f16, dst_f16,
+            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
+            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
+    );
+    CUDA_CHECK(cudaGetLastError());
+
+    CUBLAS_CHECK(
+    cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
+            ne01, ne11, ne10,
+            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
+                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
+            &beta_f16,  (       void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
+            ne23,
+            CUBLAS_COMPUTE_16F,
+            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    if (src0_as != 0) {
+        ggml_cuda_pool_free(src0_as_f16, src0_as);
|
8221
|
+
}
|
8222
|
+
if (ptrs_src_s != 0) {
|
8223
|
+
ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
|
8224
|
+
}
|
8225
|
+
if (ptrs_dst_s != 0) {
|
8226
|
+
ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
|
8227
|
+
}
|
8228
|
+
|
8229
|
+
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
8230
|
+
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
|
8231
|
+
|
8232
|
+
ggml_cuda_pool_free(src1_as_f16, src1_as);
|
8233
|
+
ggml_cuda_pool_free(dst_f16, dst_as);
|
8234
|
+
}
|
8235
|
+
#endif
|
8236
|
+
|
8237
|
+
static void ggml_cuda_mul_mat_id(const ggml_tensor * _src0, const ggml_tensor * _src1, ggml_tensor * dst) {
|
8238
|
+
#if 0
|
8239
|
+
//#ifdef CUDA_USE_TENSOR_CORES
|
8240
|
+
// const bool use_tensor_cores = true;
|
8241
|
+
//#else
|
8242
|
+
// const bool use_tensor_cores = false;
|
8243
|
+
//#endif
|
8244
|
+
|
8245
|
+
ggml_cuda_mul_mat_id_cublas(dst);
|
8246
|
+
|
8247
|
+
// TODO: mmq/mmv support
|
8248
|
+
#else
|
8249
|
+
const struct ggml_tensor * ids = dst->src[0];
|
8250
|
+
const struct ggml_tensor * src1 = dst->src[1];
|
8251
|
+
const int id = dst->op_params[0];
|
8252
|
+
|
8253
|
+
int32_t * ids_dev = (int32_t *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
|
8254
|
+
|
8255
|
+
int32_t a_id;
|
8256
|
+
CUDA_CHECK(cudaMemcpyAsync(&a_id, ids_dev + id, sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
|
8257
|
+
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
|
8258
|
+
|
8259
|
+
GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
|
8260
|
+
const struct ggml_tensor * src0 = dst->src[a_id + 2];
|
8261
|
+
|
8262
|
+
ggml_cuda_mul_mat(src0, src1, dst);
|
8263
|
+
#endif
|
8264
|
+
|
8265
|
+
(void) _src0;
|
8266
|
+
(void) _src1;
|
8267
|
+
}
|
8268
|
+
|
7729
8269
|
static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7730
8270
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
|
7731
8271
|
}
|
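For context: the `ggml_cuda_mul_mat_id` path added above routes a matrix multiplication through an index tensor. The host reads `ids[id]`, asserts the value is in range, selects `dst->src[a_id + 2]` as the weight matrix, and then reuses the ordinary `ggml_cuda_mul_mat` path. The self-contained C++ sketch below illustrates only that selection-by-id idea; the `Matrix`, `naive_mul_mat`, and `experts` names are hypothetical and are not part of ggml or this gem.

// Hypothetical illustration of index-based operand selection ("mul_mat_id"):
// pick one of several candidate matrices by an id read from a routing array,
// then run an ordinary matrix multiplication on the selected operand.
#include <cassert>
#include <cstdio>
#include <vector>

using Matrix = std::vector<float>; // row-major, n x n for simplicity

static Matrix naive_mul_mat(const Matrix & a, const Matrix & b, int n) {
    Matrix c(n * n, 0.0f);
    for (int i = 0; i < n; ++i)
        for (int k = 0; k < n; ++k)
            for (int j = 0; j < n; ++j)
                c[i*n + j] += a[i*n + k] * b[k*n + j];
    return c;
}

int main() {
    const int n = 2;
    // three candidate "expert" matrices and a routing array of ids
    std::vector<Matrix> experts = {
        {1, 0, 0, 1},   // identity
        {2, 0, 0, 2},   // 2 * identity
        {0, 1, 1, 0},   // swap rows
    };
    std::vector<int> ids = {2, 0, 1};
    const int id = 0;                 // which routing slot to use, as in op_params[0]

    int a_id = ids[id];               // host-side read, analogous to copying ids_dev[id] back
    assert(a_id >= 0 && a_id < (int) experts.size());

    Matrix b = {1, 2, 3, 4};
    Matrix c = naive_mul_mat(experts[a_id], b, n);
    std::printf("selected expert %d -> c[0] = %g\n", a_id, c[0]);
    return 0;
}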
@@ -7770,14 +8310,17 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
 
     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-
+        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
+        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
+        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
+        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -7788,6 +8331,7 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
 }
 
 static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    // TODO: why do we pass dst as src1 here?
     ggml_cuda_cpy(src0, dst, nullptr);
     (void) src1;
 }
@@ -7813,6 +8357,16 @@ static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
 }
 
+static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
+}
+
+static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
+}
+
 static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
@@ -8068,8 +8622,9 @@ void ggml_cuda_set_main_device(const int main_device) {
                 main_device, g_device_count, g_main_device);
         return;
     }
-
-    if (g_device_count > 1) {
+
+    if (g_main_device != main_device && g_device_count > 1) {
+        g_main_device = main_device;
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
         fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
@@ -8095,7 +8650,7 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    if (!g_cublas_loaded)
+    if (!g_cublas_loaded) return false;
 
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8131,6 +8686,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_MUL:
            func = ggml_cuda_mul;
            break;
+        case GGML_OP_DIV:
+            func = ggml_cuda_div;
+            break;
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(tensor)) {
                case GGML_UNARY_OP_GELU:
@@ -8144,7 +8702,8 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
                    break;
                default:
                    return false;
-            }
+            }
+            break;
        case GGML_OP_NORM:
            func = ggml_cuda_norm;
            break;
@@ -8157,6 +8716,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
            }
            func = ggml_cuda_mul_mat;
            break;
+        case GGML_OP_MUL_MAT_ID:
+            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
+                return false;
+            }
+            func = ggml_cuda_mul_mat_id;
+            break;
        case GGML_OP_SCALE:
            func = ggml_cuda_scale;
            break;
@@ -8196,6 +8761,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
        case GGML_OP_IM2COL:
            func = ggml_cuda_im2col;
            break;
+        case GGML_OP_SUM_ROWS:
+            func = ggml_cuda_sum_rows;
+            break;
+        case GGML_OP_ARGSORT:
+            func = ggml_cuda_argsort;
+            break;
        default:
            return false;
    }
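For context: the new cases above only extend the existing dispatch in `ggml_cuda_compute_forward`, which maps each `GGML_OP_*` value to a launcher function through a `switch` and then invokes the selected function once. The self-contained C++ sketch below (hypothetical `demo_*` names, not part of ggml or this gem) shows that dispatch shape in isolation:

// Hypothetical sketch of enum-to-launcher dispatch: a switch picks a function
// pointer for the requested op, then the caller invokes it exactly once.
#include <cstdio>

enum demo_op { DEMO_OP_ADD, DEMO_OP_DIV, DEMO_OP_SUM_ROWS };

using demo_func_t = void (*)(float * dst, const float * src, int n);

static void demo_add(float * dst, const float * src, int n)      { for (int i = 0; i < n; ++i) dst[i] += src[i]; }
static void demo_div(float * dst, const float * src, int n)      { for (int i = 0; i < n; ++i) dst[i] /= src[i]; }
static void demo_sum_rows(float * dst, const float * src, int n) { dst[0] = 0; for (int i = 0; i < n; ++i) dst[0] += src[i]; }

static bool demo_compute_forward(demo_op op, float * dst, const float * src, int n) {
    demo_func_t func = nullptr;
    switch (op) {
        case DEMO_OP_ADD:      func = demo_add;      break;
        case DEMO_OP_DIV:      func = demo_div;      break;   // new case, like GGML_OP_DIV above
        case DEMO_OP_SUM_ROWS: func = demo_sum_rows; break;   // new case, like GGML_OP_SUM_ROWS above
        default:               return false;                  // unsupported op
    }
    func(dst, src, n);
    return true;
}

int main() {
    float dst[4] = {2, 4, 6, 8};
    const float src[4] = {2, 2, 2, 2};
    if (demo_compute_forward(DEMO_OP_DIV, dst, src, 4)) {
        std::printf("dst[0] = %g\n", dst[0]);   // prints 1
    }
    return 0;
}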
@@ -8212,7 +8783,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
 
 int ggml_cuda_get_device_count() {
     int device_count;
-
+    if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
+        return 0;
+    }
     return device_count;
 }
 
@@ -8228,27 +8801,16 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
 
 #define UNUSED GGML_UNUSED
 
-
-};
-
-static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
-    return GGML_CUDA_NAME;
-
-    UNUSED(backend);
-}
-
-static void ggml_backend_cuda_free(ggml_backend_t backend) {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-    delete cuda_ctx;
-    delete backend;
-}
+// cuda buffer
 
 struct ggml_backend_buffer_context_cuda {
-
-
+    int device;
+    void * dev_ptr = nullptr;
     ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
     size_t temp_tensor_extra_index = 0;
 
+    ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
+
     ~ggml_backend_buffer_context_cuda() {
         delete[] temp_tensor_extras;
     }
@@ -8269,41 +8831,20 @@ struct ggml_backend_buffer_context_cuda {
 
 static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-    CUDA_CHECK(cudaFree(ctx->
+    CUDA_CHECK(cudaFree(ctx->dev_ptr));
     delete ctx;
 }
 
 static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-    return ctx->
-}
-
-static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    int64_t row_low = 0;
-    int64_t row_high = ggml_nrows(tensor);
-    int64_t nrows_split = row_high - row_low;
-
-    size_t size = ggml_nbytes_split(tensor, nrows_split);
-
-    int64_t ne0 = tensor->ne[0];
-
-    if (ggml_is_quantized(tensor->type)) {
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
-                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
-        }
-    }
-
-    return size;
-
-    UNUSED(buffer);
+    return ctx->dev_ptr;
 }
 
 static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
     if (tensor->view_src != NULL && tensor->view_offs == 0) {
-        assert(tensor->view_src->buffer->
+        assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
         tensor->backend = tensor->view_src->backend;
         tensor->extra = tensor->view_src->extra;
         return;
@@ -8311,7 +8852,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
 
     ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
 
-    extra->data_device[
+    extra->data_device[ctx->device] = tensor->data;
 
     tensor->backend = GGML_BACKEND_GPU;
     tensor->extra = extra;
@@ -8323,64 +8864,208 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
     int64_t nrows_split = row_high - row_low;
 
     size_t original_size = ggml_nbytes_split(tensor, nrows_split);
-    size_t padded_size =
+    size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
 
     if (padded_size > original_size && tensor->view_src == nullptr) {
-        CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[
+        CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
     }
 }
 
     UNUSED(buffer);
 }
 
+static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
+
+    UNUSED(buffer);
+}
+
+static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
+
+    UNUSED(buffer);
+}
+
 static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
-    /* .free_buffer
-    /* .get_base
-    /* .
-    /* .
-    /* .
+    /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
+    /* .get_base = */ ggml_backend_cuda_buffer_get_base,
+    /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
+    /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
+    /* .cpy_tensor_from = */ NULL,
+    /* .cpy_tensor_to = */ NULL,
 };
 
-
-
+// cuda buffer type
+
+static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    int device = (int) (intptr_t) buft->context;
 
-
+    ggml_cuda_set_device(device);
 
     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
 
-
-    CUDA_CHECK(cudaMalloc(&
+    void * dev_ptr;
+    CUDA_CHECK(cudaMalloc(&dev_ptr, size));
+
+    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
 
-    return ggml_backend_buffer_init(
+    return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
 }
 
-static size_t
+static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 128;
+
+    UNUSED(buft);
+}
+
+static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
+    int64_t row_low = 0;
+    int64_t row_high = ggml_nrows(tensor);
+    int64_t nrows_split = row_high - row_low;
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+    int64_t ne0 = tensor->ne[0];
+
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }
+    }
+
+    return size;
+
+    UNUSED(buft);
+}
+
+static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    return ggml_backend_is_cuda(backend);
+
+    UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
+    /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
+    /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
+    /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
+    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
+    static bool ggml_backend_buffer_type_cuda_initialized = false;
+    if (!ggml_backend_buffer_type_cuda_initialized) {
+        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
+            ggml_backend_buffer_type_cuda[i] = {
+                /* .iface = */ cuda_backend_buffer_type_interface,
+                /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
+            };
+        }
+        ggml_backend_buffer_type_cuda_initialized = true;
+    }
+
+    return &ggml_backend_buffer_type_cuda[device];
+}
+
+// host buffer type
+
+static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
+    delete ctx;
+}
+
+static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * ptr;
+    CUDA_CHECK(cudaMallocHost(&ptr, size));
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
+
+    return buffer;
+
+    UNUSED(buft);
+}
+
+struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
+    /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+    /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+    /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+    /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
+        /* .iface = */ cuda_backend_host_buffer_type_interface,
+        /* .context = */ nullptr,
+    };
+
+    return &ggml_backend_buffer_type_cuda_host;
+}
+
+// backend
+
+struct ggml_backend_context_cuda {
+    int device;
+};
+
+static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+    return GGML_CUDA_NAME;
+
     UNUSED(backend);
 }
 
+static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    delete cuda_ctx;
+    delete backend;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
+}
+
 static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[
-
-    UNUSED(backend);
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
 }
 
 static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[
-
-    UNUSED(backend);
+    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
 }
 
 static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
-
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
 
     UNUSED(backend);
 }
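For context: the buffer and buffer-type code above is organized as plain structs of function pointers (`ggml_backend_buffer_i`, `ggml_backend_buffer_type_i`) that each backend fills in, so allocation, alignment, and tensor I/O go through the same indirection regardless of backend. The self-contained C++ sketch below (hypothetical `demo_*` names, not the ggml API) shows the pattern in miniature:

// Hypothetical sketch of the "interface struct of function pointers" pattern:
// a buffer type exposes alloc/alignment callbacks through a plain struct,
// so different backends can be plugged in behind the same calls.
#include <cstddef>
#include <cstdio>
#include <cstdlib>

struct demo_buffer {
    void * base;
    size_t size;
};

struct demo_buffer_type_i {                    // analogous role to ggml_backend_buffer_type_i
    demo_buffer (*alloc_buffer)(size_t size);
    size_t      (*get_alignment)();
};

static demo_buffer host_alloc(size_t size) {
    return { std::malloc(size), size };        // a GPU backend would allocate device memory here instead
}

static size_t host_alignment() {
    return 128;                                // mirrors the 128-byte alignment returned above
}

static const demo_buffer_type_i host_buffer_type = {
    /* .alloc_buffer  = */ host_alloc,
    /* .get_alignment = */ host_alignment,
};

int main() {
    demo_buffer buf = host_buffer_type.alloc_buffer(1024);
    std::printf("allocated %zu bytes, alignment %zu\n", buf.size, host_buffer_type.get_alignment());
    std::free(buf.base);
    return 0;
}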
@@ -8394,14 +9079,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
     UNUSED(cgraph);
 }
 
-
+static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
     UNUSED(plan);
 }
 
-
+static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
@@ -8409,7 +9094,9 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
 }
 
 static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    ggml_cuda_set_main_device(cuda_ctx->device);
 
     ggml_compute_params params = {};
     params.type = GGML_TASK_COMPUTE;
@@ -8417,13 +9104,18 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
            continue;
-
+
        assert(node->backend == GGML_BACKEND_GPU);
+        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+        assert(node->extra != nullptr);
+
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            if (node->src[j] != nullptr) {
                assert(node->src[j]->backend == GGML_BACKEND_GPU);
+                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                assert(node->src[j]->extra != nullptr);
            }
        }
 
@@ -8460,27 +9152,98 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     UNUSED(backend);
 }
 
+static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_RELU:
+                    return true;
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
+            {
+                struct ggml_tensor * a;
+                struct ggml_tensor * b;
+                if (op->op == GGML_OP_MUL_MAT) {
+                    a = op->src[0];
+                    b = op->src[1];
+                } else {
+                    a = op->src[2];
+                    b = op->src[1];
+                }
+                if (a->ne[3] != b->ne[3]) {
+                    return false;
+                }
+                return true;
+            } break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_NORM:
+        case GGML_OP_REPEAT:
+        case GGML_OP_GET_ROWS:
+        case GGML_OP_DUP:
+        case GGML_OP_ADD:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_SCALE:
+        case GGML_OP_SQR:
+        case GGML_OP_CLAMP:
+        case GGML_OP_CPY:
+        case GGML_OP_CONT:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_ROPE:
+        case GGML_OP_ALIBI:
+        case GGML_OP_IM2COL:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_ARGSORT:
+            return true;
+        default:
+            return false;
+    }
+
+    UNUSED(backend);
+}
+
 static ggml_backend_i cuda_backend_i = {
-    /* .get_name
-    /* .free
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .supports_op = */ nullptr,
+    /* .get_name = */ ggml_backend_cuda_name,
+    /* .free = */ ggml_backend_cuda_free,
+    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
+    /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
+    /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
+    /* .cpy_tensor_from_async = */ NULL,
+    /* .cpy_tensor_to_async = */ NULL,
+    /* .synchronize = */ ggml_backend_cuda_synchronize,
+    /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
+    /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
+    /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
+    /* .graph_compute = */ ggml_backend_cuda_graph_compute,
+    /* .supports_op = */ ggml_backend_cuda_supports_op,
 };
 
-ggml_backend_t ggml_backend_cuda_init() {
+ggml_backend_t ggml_backend_cuda_init(int device) {
     ggml_init_cublas(); // TODO: remove from ggml.c
 
-
+    if (device < 0 || device >= ggml_cuda_get_device_count()) {
+        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+        return nullptr;
+    }
+
+    // not strictly necessary, but it may reduce the overhead of the first graph_compute
+    ggml_cuda_set_main_device(device);
+
+    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
+        /* .device = */ device
+    };
 
     ggml_backend_t cuda_backend = new ggml_backend {
         /* .interface = */ cuda_backend_i,
@@ -8489,3 +9252,25 @@ ggml_backend_t ggml_backend_cuda_init() {
 
     return cuda_backend;
 }
+
+bool ggml_backend_is_cuda(ggml_backend_t backend) {
+    return backend->iface.get_name == ggml_backend_cuda_name;
+}
+
+static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
+    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
+    return cuda_backend;
+
+    UNUSED(params);
+}
+
+extern "C" int ggml_backend_cuda_reg_devices() {
+    int device_count = ggml_cuda_get_device_count();
+    //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
+    for (int i = 0; i < device_count; i++) {
+        char name[128];
+        snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
+        ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
+    }
+    return device_count;
+}
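For context: the final hunk registers one backend per visible device, generating names such as `CUDA0`, `CUDA1`, and so on, and passing the device index back to the init callback through a `user_data` pointer. The self-contained C++ sketch below (hypothetical registry and `demo_init`, not the actual ggml_backend_register machinery) illustrates that per-device registration loop:

// Hypothetical sketch of per-device backend registration:
// each device gets a generated name and an init callback that receives
// the device index back through a user_data pointer.
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <utility>

using init_fn = void (*)(void * user_data);

static std::map<std::string, std::pair<init_fn, void *>> registry;

static void register_backend(const std::string & name, init_fn fn, void * user_data) {
    registry[name] = { fn, user_data };
}

static void demo_init(void * user_data) {
    std::printf("init device %d\n", (int) (intptr_t) user_data);
}

int main() {
    const int device_count = 2;                       // stand-in for a real device query
    for (int i = 0; i < device_count; ++i) {
        char name[128];
        std::snprintf(name, sizeof(name), "CUDA%d", i);
        register_backend(name, demo_init, (void *) (intptr_t) i);
    }
    for (auto & kv : registry) {
        kv.second.first(kv.second.second);            // call each registered init callback
    }
    return 0;
}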