llama_cpp 0.9.4 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +121 -15
- data/ext/llama_cpp/src/ggml-alloc.c +43 -8
- data/ext/llama_cpp/src/ggml-alloc.h +7 -0
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1270 -434
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +535 -175
- data/ext/llama_cpp/src/ggml-metal.metal +888 -237
- data/ext/llama_cpp/src/ggml-opencl.cpp +5 -7
- data/ext/llama_cpp/src/ggml.c +393 -127
- data/ext/llama_cpp/src/ggml.h +59 -7
- data/ext/llama_cpp/src/llama.cpp +791 -357
- data/ext/llama_cpp/src/llama.h +29 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +20 -2
- metadata +3 -3
@@ -1,7 +1,8 @@
|
|
1
1
|
#include <algorithm>
|
2
|
-
#include <cinttypes>
|
3
2
|
#include <cstddef>
|
4
3
|
#include <cstdint>
|
4
|
+
#include <cinttypes>
|
5
|
+
#include <float.h>
|
5
6
|
#include <limits>
|
6
7
|
#include <stdint.h>
|
7
8
|
#include <stdio.h>
|
@@ -69,6 +70,7 @@
|
|
69
70
|
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
70
71
|
#define cudaSetDevice hipSetDevice
|
71
72
|
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
73
|
+
#define cudaStreamFireAndForget hipStreamFireAndForget
|
72
74
|
#define cudaStreamNonBlocking hipStreamNonBlocking
|
73
75
|
#define cudaStreamSynchronize hipStreamSynchronize
|
74
76
|
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
|
@@ -190,7 +192,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
|
190
192
|
fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
|
191
193
|
cudaGetErrorString(err_)); \
|
192
194
|
fprintf(stderr, "current device: %d\n", id); \
|
193
|
-
|
195
|
+
GGML_ASSERT(!"CUDA error"); \
|
194
196
|
} \
|
195
197
|
} while (0)
|
196
198
|
|
@@ -204,7 +206,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
|
204
206
|
fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
|
205
207
|
err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
|
206
208
|
fprintf(stderr, "current device: %d\n", id); \
|
207
|
-
|
209
|
+
GGML_ASSERT(!"cuBLAS error"); \
|
208
210
|
} \
|
209
211
|
} while (0)
|
210
212
|
#else
|
@@ -216,7 +218,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
|
216
218
|
cudaGetDevice(&id); \
|
217
219
|
fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
|
218
220
|
fprintf(stderr, "current device: %d\n", id); \
|
219
|
-
|
221
|
+
GGML_ASSERT(!"cuBLAS error"); \
|
220
222
|
} \
|
221
223
|
} while (0)
|
222
224
|
#endif // CUDART_VERSION >= 11
|
@@ -433,8 +435,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
|
|
433
435
|
#define WARP_SIZE 32
|
434
436
|
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
435
437
|
|
436
|
-
#define CUDA_ADD_BLOCK_SIZE 256
|
437
|
-
#define CUDA_MUL_BLOCK_SIZE 256
|
438
438
|
#define CUDA_GELU_BLOCK_SIZE 256
|
439
439
|
#define CUDA_SILU_BLOCK_SIZE 256
|
440
440
|
#define CUDA_RELU_BLOCK_SIZE 256
|
@@ -443,6 +443,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
|
|
443
443
|
#define CUDA_SCALE_BLOCK_SIZE 256
|
444
444
|
#define CUDA_CLAMP_BLOCK_SIZE 256
|
445
445
|
#define CUDA_ROPE_BLOCK_SIZE 256
|
446
|
+
#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
|
446
447
|
#define CUDA_ALIBI_BLOCK_SIZE 32
|
447
448
|
#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
|
448
449
|
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
@@ -501,40 +502,112 @@ static size_t g_scratch_offset = 0;
|
|
501
502
|
|
502
503
|
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
503
504
|
|
504
|
-
static
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
return;
|
505
|
+
static __device__ __forceinline__ float warp_reduce_sum(float x) {
|
506
|
+
#pragma unroll
|
507
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
508
|
+
x += __shfl_xor_sync(0xffffffff, x, mask, 32);
|
509
509
|
}
|
510
|
-
|
510
|
+
return x;
|
511
511
|
}
|
512
512
|
|
513
|
-
static
|
514
|
-
|
513
|
+
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
|
514
|
+
#pragma unroll
|
515
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
516
|
+
a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
|
517
|
+
a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
|
518
|
+
}
|
519
|
+
return a;
|
520
|
+
}
|
515
521
|
|
516
|
-
|
517
|
-
|
522
|
+
static __device__ __forceinline__ float warp_reduce_max(float x) {
|
523
|
+
#pragma unroll
|
524
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
525
|
+
x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
|
518
526
|
}
|
519
|
-
|
527
|
+
return x;
|
520
528
|
}
|
521
529
|
|
522
|
-
static
|
523
|
-
|
530
|
+
static __device__ __forceinline__ float op_repeat(const float a, const float b) {
|
531
|
+
return b;
|
532
|
+
}
|
524
533
|
|
525
|
-
|
534
|
+
static __device__ __forceinline__ float op_add(const float a, const float b) {
|
535
|
+
return a + b;
|
536
|
+
}
|
537
|
+
|
538
|
+
static __device__ __forceinline__ float op_mul(const float a, const float b) {
|
539
|
+
return a * b;
|
540
|
+
}
|
541
|
+
|
542
|
+
static __device__ __forceinline__ float op_div(const float a, const float b) {
|
543
|
+
return a / b;
|
544
|
+
}
|
545
|
+
|
546
|
+
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
547
|
+
static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
548
|
+
int ne0, int ne1, int ne2, int ne3,
|
549
|
+
int ne10, int ne11, int ne12, int ne13,
|
550
|
+
/*int s0, */ int s1, int s2, int s3,
|
551
|
+
/*int s10,*/ int s11, int s12, int s13) {
|
552
|
+
const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
|
553
|
+
const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
|
554
|
+
const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
|
555
|
+
const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
|
556
|
+
|
557
|
+
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
|
526
558
|
return;
|
527
559
|
}
|
528
|
-
|
560
|
+
|
561
|
+
const int i11 = i1 % ne11;
|
562
|
+
const int i12 = i2 % ne12;
|
563
|
+
const int i13 = i3 % ne13;
|
564
|
+
|
565
|
+
const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
|
566
|
+
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
567
|
+
const size_t i_dst = i_src0;
|
568
|
+
|
569
|
+
const src0_t * src0_row = src0 + i_src0;
|
570
|
+
const src1_t * src1_row = src1 + i_src1;
|
571
|
+
dst_t * dst_row = dst + i_dst;
|
572
|
+
|
573
|
+
for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
|
574
|
+
const int i10 = i0 % ne10;
|
575
|
+
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
576
|
+
}
|
529
577
|
}
|
530
578
|
|
531
|
-
|
579
|
+
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
580
|
+
static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
581
|
+
int ne0, int ne1, int ne2, int ne3,
|
582
|
+
int ne10, int ne11, int ne12, int ne13,
|
583
|
+
/*int s0, */ int s1, int s2, int s3,
|
584
|
+
/*int s10,*/ int s11, int s12, int s13) {
|
585
|
+
|
532
586
|
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
533
587
|
|
534
|
-
|
588
|
+
const int i3 = i/(ne2*ne1*ne0);
|
589
|
+
const int i2 = (i/(ne1*ne0)) % ne2;
|
590
|
+
const int i1 = (i/ne0) % ne1;
|
591
|
+
const int i0 = i % ne0;
|
592
|
+
|
593
|
+
if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
|
535
594
|
return;
|
536
595
|
}
|
537
|
-
|
596
|
+
|
597
|
+
const int i11 = i1 % ne11;
|
598
|
+
const int i12 = i2 % ne12;
|
599
|
+
const int i13 = i3 % ne13;
|
600
|
+
|
601
|
+
const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
|
602
|
+
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
603
|
+
const size_t i_dst = i_src0;
|
604
|
+
|
605
|
+
const src0_t * src0_row = src0 + i_src0;
|
606
|
+
const src1_t * src1_row = src1 + i_src1;
|
607
|
+
dst_t * dst_row = dst + i_dst;
|
608
|
+
|
609
|
+
const int i10 = i0 % ne10;
|
610
|
+
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
538
611
|
}
|
539
612
|
|
540
613
|
static __global__ void gelu_f32(const float * x, float * dst, const int k) {
|
@@ -577,22 +650,11 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
|
|
577
650
|
dst[i] = x[i] * x[i];
|
578
651
|
}
|
579
652
|
|
580
|
-
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
|
581
|
-
#pragma unroll
|
582
|
-
for (int mask = 16; mask > 0; mask >>= 1) {
|
583
|
-
a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
|
584
|
-
a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
|
585
|
-
}
|
586
|
-
return a;
|
587
|
-
}
|
588
|
-
|
589
653
|
template <int block_size>
|
590
|
-
static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
|
654
|
+
static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
|
591
655
|
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
592
656
|
const int tid = threadIdx.x;
|
593
657
|
|
594
|
-
const float eps = 1e-5f;
|
595
|
-
|
596
658
|
float2 mean_var = make_float2(0.f, 0.f);
|
597
659
|
|
598
660
|
for (int col = tid; col < ncols; col += block_size) {
|
@@ -624,14 +686,6 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
|
|
624
686
|
}
|
625
687
|
}
|
626
688
|
|
627
|
-
static __device__ __forceinline__ float warp_reduce_sum(float x) {
|
628
|
-
#pragma unroll
|
629
|
-
for (int mask = 16; mask > 0; mask >>= 1) {
|
630
|
-
x += __shfl_xor_sync(0xffffffff, x, mask, 32);
|
631
|
-
}
|
632
|
-
return x;
|
633
|
-
}
|
634
|
-
|
635
689
|
template <int block_size>
|
636
690
|
static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
|
637
691
|
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
@@ -4550,6 +4604,116 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
|
4550
4604
|
cpy_1(cx + x_offset, cdst + dst_offset);
|
4551
4605
|
}
|
4552
4606
|
|
4607
|
+
static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
|
4608
|
+
const float * xi = (const float *) cxi;
|
4609
|
+
block_q8_0 * dsti = (block_q8_0 *) cdsti;
|
4610
|
+
|
4611
|
+
float amax = 0.0f; // absolute max
|
4612
|
+
|
4613
|
+
for (int j = 0; j < QK8_0; j++) {
|
4614
|
+
const float v = xi[j];
|
4615
|
+
amax = fmaxf(amax, fabsf(v));
|
4616
|
+
}
|
4617
|
+
|
4618
|
+
const float d = amax / ((1 << 7) - 1);
|
4619
|
+
const float id = d ? 1.0f/d : 0.0f;
|
4620
|
+
|
4621
|
+
dsti->d = d;
|
4622
|
+
|
4623
|
+
for (int j = 0; j < QK8_0; ++j) {
|
4624
|
+
const float x0 = xi[j]*id;
|
4625
|
+
|
4626
|
+
dsti->qs[j] = roundf(x0);
|
4627
|
+
}
|
4628
|
+
}
|
4629
|
+
|
4630
|
+
static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
|
4631
|
+
const float * xi = (const float *) cxi;
|
4632
|
+
block_q4_0 * dsti = (block_q4_0 *) cdsti;
|
4633
|
+
|
4634
|
+
float amax = 0.0f;
|
4635
|
+
float vmax = 0.0f;
|
4636
|
+
|
4637
|
+
for (int j = 0; j < QK4_0; ++j) {
|
4638
|
+
const float v = xi[j];
|
4639
|
+
if (amax < fabsf(v)) {
|
4640
|
+
amax = fabsf(v);
|
4641
|
+
vmax = v;
|
4642
|
+
}
|
4643
|
+
}
|
4644
|
+
|
4645
|
+
const float d = vmax / -8;
|
4646
|
+
const float id = d ? 1.0f/d : 0.0f;
|
4647
|
+
|
4648
|
+
dsti->d = d;
|
4649
|
+
|
4650
|
+
for (int j = 0; j < QK4_0/2; ++j) {
|
4651
|
+
const float x0 = xi[0 + j]*id;
|
4652
|
+
const float x1 = xi[QK4_0/2 + j]*id;
|
4653
|
+
|
4654
|
+
const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
|
4655
|
+
const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
|
4656
|
+
|
4657
|
+
dsti->qs[j] = xi0;
|
4658
|
+
dsti->qs[j] |= xi1 << 4;
|
4659
|
+
}
|
4660
|
+
}
|
4661
|
+
|
4662
|
+
static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
|
4663
|
+
const float * xi = (const float *) cxi;
|
4664
|
+
block_q4_1 * dsti = (block_q4_1 *) cdsti;
|
4665
|
+
|
4666
|
+
float vmin = FLT_MAX;
|
4667
|
+
float vmax = -FLT_MAX;
|
4668
|
+
|
4669
|
+
for (int j = 0; j < QK4_1; ++j) {
|
4670
|
+
const float v = xi[j];
|
4671
|
+
|
4672
|
+
if (v < vmin) vmin = v;
|
4673
|
+
if (v > vmax) vmax = v;
|
4674
|
+
}
|
4675
|
+
|
4676
|
+
const float d = (vmax - vmin) / ((1 << 4) - 1);
|
4677
|
+
const float id = d ? 1.0f/d : 0.0f;
|
4678
|
+
|
4679
|
+
dsti->dm.x = d;
|
4680
|
+
dsti->dm.y = vmin;
|
4681
|
+
|
4682
|
+
for (int j = 0; j < QK4_1/2; ++j) {
|
4683
|
+
const float x0 = (xi[0 + j] - vmin)*id;
|
4684
|
+
const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
|
4685
|
+
|
4686
|
+
const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
|
4687
|
+
const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
|
4688
|
+
|
4689
|
+
dsti->qs[j] = xi0;
|
4690
|
+
dsti->qs[j] |= xi1 << 4;
|
4691
|
+
}
|
4692
|
+
}
|
4693
|
+
|
4694
|
+
template <cpy_kernel_t cpy_blck, int qk>
|
4695
|
+
static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
|
4696
|
+
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
4697
|
+
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
|
4698
|
+
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
|
4699
|
+
|
4700
|
+
if (i >= ne) {
|
4701
|
+
return;
|
4702
|
+
}
|
4703
|
+
|
4704
|
+
const int i02 = i / (ne00*ne01);
|
4705
|
+
const int i01 = (i - i02*ne01*ne00) / ne00;
|
4706
|
+
const int i00 = (i - i02*ne01*ne00 - i01*ne00);
|
4707
|
+
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
|
4708
|
+
|
4709
|
+
const int i12 = i / (ne10*ne11);
|
4710
|
+
const int i11 = (i - i12*ne10*ne11) / ne10;
|
4711
|
+
const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
|
4712
|
+
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
|
4713
|
+
|
4714
|
+
cpy_blck(cx + x_offset, cdst + dst_offset);
|
4715
|
+
}
|
4716
|
+
|
4553
4717
|
static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
|
4554
4718
|
const float y = (i0 / 2 - low) / max(0.001f, high - low);
|
4555
4719
|
return 1.0f - min(1.0f, max(0.0f, y));
|
@@ -4610,8 +4774,8 @@ static __global__ void rope(
|
|
4610
4774
|
|
4611
4775
|
template<typename T, bool has_pos>
|
4612
4776
|
static __global__ void rope_neox(
|
4613
|
-
const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows,
|
4614
|
-
float ext_factor, float attn_factor, rope_corr_dims corr_dims
|
4777
|
+
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
|
4778
|
+
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
|
4615
4779
|
) {
|
4616
4780
|
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
4617
4781
|
|
@@ -4620,23 +4784,25 @@ static __global__ void rope_neox(
|
|
4620
4784
|
}
|
4621
4785
|
|
4622
4786
|
const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
4623
|
-
const int
|
4787
|
+
const int ib = col / n_dims;
|
4788
|
+
const int ic = col % n_dims;
|
4789
|
+
|
4790
|
+
const int i = row*ncols + ib*n_dims + ic/2;
|
4624
4791
|
const int i2 = row/p_delta_rows;
|
4625
4792
|
|
4626
|
-
|
4627
|
-
const float cur_rot = -float(col)/ncols;
|
4793
|
+
float cur_rot = inv_ndims * ic - ib;
|
4628
4794
|
|
4629
4795
|
const int p = has_pos ? pos[i2] : 0;
|
4630
|
-
const float theta_base = p*powf(
|
4796
|
+
const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
|
4631
4797
|
|
4632
4798
|
float cos_theta, sin_theta;
|
4633
4799
|
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
4634
4800
|
|
4635
4801
|
const float x0 = x[i + 0];
|
4636
|
-
const float x1 = x[i +
|
4802
|
+
const float x1 = x[i + n_dims/2];
|
4637
4803
|
|
4638
|
-
dst[i + 0]
|
4639
|
-
dst[i +
|
4804
|
+
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
4805
|
+
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
|
4640
4806
|
}
|
4641
4807
|
|
4642
4808
|
static __global__ void rope_glm_f32(
|
@@ -4702,6 +4868,65 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols,
|
|
4702
4868
|
dst[i] = col * m_k + x[i];
|
4703
4869
|
}
|
4704
4870
|
|
4871
|
+
static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
|
4872
|
+
const int row = blockIdx.y;
|
4873
|
+
const int col = threadIdx.x;
|
4874
|
+
|
4875
|
+
float sum = 0.0f;
|
4876
|
+
for (int i = col; i < ncols; i += blockDim.x) {
|
4877
|
+
sum += x[row * ncols + i];
|
4878
|
+
}
|
4879
|
+
|
4880
|
+
sum = warp_reduce_sum(sum);
|
4881
|
+
|
4882
|
+
if (col == 0) {
|
4883
|
+
dst[row] = sum;
|
4884
|
+
}
|
4885
|
+
}
|
4886
|
+
|
4887
|
+
template<typename T>
|
4888
|
+
static inline __device__ void swap(T & a, T & b) {
|
4889
|
+
T tmp = a;
|
4890
|
+
a = b;
|
4891
|
+
b = tmp;
|
4892
|
+
}
|
4893
|
+
|
4894
|
+
template<ggml_sort_order order>
|
4895
|
+
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
|
4896
|
+
// bitonic sort
|
4897
|
+
int col = threadIdx.x;
|
4898
|
+
int row = blockIdx.y;
|
4899
|
+
|
4900
|
+
if (col >= ncols) return;
|
4901
|
+
|
4902
|
+
const float * x_row = x + row * ncols;
|
4903
|
+
int * dst_row = dst + row * ncols;
|
4904
|
+
|
4905
|
+
// initialize indices
|
4906
|
+
if (col < ncols) {
|
4907
|
+
dst_row[col] = col;
|
4908
|
+
}
|
4909
|
+
__syncthreads();
|
4910
|
+
|
4911
|
+
for (int k = 2; k <= ncols; k *= 2) {
|
4912
|
+
for (int j = k / 2; j > 0; j /= 2) {
|
4913
|
+
int ixj = col ^ j;
|
4914
|
+
if (ixj > col) {
|
4915
|
+
if ((col & k) == 0) {
|
4916
|
+
if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
|
4917
|
+
swap(dst_row[col], dst_row[ixj]);
|
4918
|
+
}
|
4919
|
+
} else {
|
4920
|
+
if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
|
4921
|
+
swap(dst_row[col], dst_row[ixj]);
|
4922
|
+
}
|
4923
|
+
}
|
4924
|
+
}
|
4925
|
+
__syncthreads();
|
4926
|
+
}
|
4927
|
+
}
|
4928
|
+
}
|
4929
|
+
|
4705
4930
|
static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
|
4706
4931
|
const int col = blockDim.y*blockIdx.y + threadIdx.y;
|
4707
4932
|
const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
@@ -4711,49 +4936,79 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
|
|
4711
4936
|
}
|
4712
4937
|
|
4713
4938
|
const int i = row*ncols + col;
|
4714
|
-
//
|
4715
|
-
dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
|
4939
|
+
//dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
|
4940
|
+
//dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
|
4941
|
+
dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
|
4716
4942
|
}
|
4717
4943
|
|
4718
|
-
|
4719
|
-
|
4720
|
-
|
4721
|
-
const int
|
4722
|
-
|
4723
|
-
const int
|
4944
|
+
static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
|
4945
|
+
const int tid = threadIdx.x;
|
4946
|
+
const int rowx = blockIdx.x;
|
4947
|
+
const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
|
4948
|
+
|
4949
|
+
const int block_size = blockDim.x;
|
4950
|
+
|
4951
|
+
const int warp_id = threadIdx.x / WARP_SIZE;
|
4952
|
+
const int lane_id = threadIdx.x % WARP_SIZE;
|
4953
|
+
|
4954
|
+
__shared__ float buf[CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE];
|
4724
4955
|
|
4725
4956
|
float max_val = -INFINITY;
|
4726
4957
|
|
4727
4958
|
for (int col = tid; col < ncols; col += block_size) {
|
4728
|
-
const int
|
4729
|
-
|
4959
|
+
const int ix = rowx*ncols + col;
|
4960
|
+
const int iy = rowy*ncols + col;
|
4961
|
+
max_val = max(max_val, x[ix]*scale + (y ? y[iy] : 0.0f));
|
4730
4962
|
}
|
4731
4963
|
|
4732
4964
|
// find the max value in the block
|
4733
|
-
|
4734
|
-
|
4735
|
-
|
4965
|
+
max_val = warp_reduce_max(max_val);
|
4966
|
+
if (block_size > WARP_SIZE) {
|
4967
|
+
if (warp_id == 0) {
|
4968
|
+
buf[lane_id] = -INFINITY;
|
4969
|
+
}
|
4970
|
+
__syncthreads();
|
4971
|
+
|
4972
|
+
if (lane_id == 0) {
|
4973
|
+
buf[warp_id] = max_val;
|
4974
|
+
}
|
4975
|
+
__syncthreads();
|
4976
|
+
|
4977
|
+
max_val = buf[lane_id];
|
4978
|
+
max_val = warp_reduce_max(max_val);
|
4736
4979
|
}
|
4737
4980
|
|
4738
4981
|
float tmp = 0.f;
|
4739
4982
|
|
4740
4983
|
for (int col = tid; col < ncols; col += block_size) {
|
4741
|
-
const int
|
4742
|
-
const
|
4984
|
+
const int ix = rowx*ncols + col;
|
4985
|
+
const int iy = rowy*ncols + col;
|
4986
|
+
const float val = expf((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val);
|
4743
4987
|
tmp += val;
|
4744
|
-
dst[
|
4988
|
+
dst[ix] = val;
|
4745
4989
|
}
|
4746
4990
|
|
4747
|
-
// sum
|
4748
|
-
|
4749
|
-
|
4750
|
-
|
4991
|
+
// find the sum of exps in the block
|
4992
|
+
tmp = warp_reduce_sum(tmp);
|
4993
|
+
if (block_size > WARP_SIZE) {
|
4994
|
+
if (warp_id == 0) {
|
4995
|
+
buf[lane_id] = 0.f;
|
4996
|
+
}
|
4997
|
+
__syncthreads();
|
4998
|
+
|
4999
|
+
if (lane_id == 0) {
|
5000
|
+
buf[warp_id] = tmp;
|
5001
|
+
}
|
5002
|
+
__syncthreads();
|
5003
|
+
|
5004
|
+
tmp = buf[lane_id];
|
5005
|
+
tmp = warp_reduce_sum(tmp);
|
4751
5006
|
}
|
4752
5007
|
|
4753
5008
|
const float inv_tmp = 1.f / tmp;
|
4754
5009
|
|
4755
5010
|
for (int col = tid; col < ncols; col += block_size) {
|
4756
|
-
const int i =
|
5011
|
+
const int i = rowx*ncols + col;
|
4757
5012
|
dst[i] *= inv_tmp;
|
4758
5013
|
}
|
4759
5014
|
}
|
@@ -4805,25 +5060,119 @@ static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const
|
|
4805
5060
|
k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
|
4806
5061
|
}
|
4807
5062
|
|
4808
|
-
|
4809
|
-
|
4810
|
-
|
4811
|
-
|
4812
|
-
|
4813
|
-
|
4814
|
-
|
4815
|
-
|
4816
|
-
|
4817
|
-
|
4818
|
-
|
4819
|
-
|
4820
|
-
|
4821
|
-
|
5063
|
+
template<float (*bin_op)(const float, const float)>
|
5064
|
+
struct bin_bcast_cuda {
|
5065
|
+
template<typename src0_t, typename src1_t, typename dst_t>
|
5066
|
+
void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
|
5067
|
+
const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
|
5068
|
+
cudaStream_t stream) {
|
5069
|
+
|
5070
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
5071
|
+
|
5072
|
+
|
5073
|
+
int nr0 = ne10/ne0;
|
5074
|
+
int nr1 = ne11/ne1;
|
5075
|
+
int nr2 = ne12/ne2;
|
5076
|
+
int nr3 = ne13/ne3;
|
5077
|
+
|
5078
|
+
int nr[4] = { nr0, nr1, nr2, nr3 };
|
5079
|
+
|
5080
|
+
// collapse dimensions until first broadcast dimension
|
5081
|
+
int64_t cne0[] = {ne0, ne1, ne2, ne3};
|
5082
|
+
int64_t cne1[] = {ne10, ne11, ne12, ne13};
|
5083
|
+
size_t cnb0[] = {nb0, nb1, nb2, nb3};
|
5084
|
+
size_t cnb1[] = {nb10, nb11, nb12, nb13};
|
5085
|
+
auto collapse = [](int64_t cne[]) {
|
5086
|
+
cne[0] *= cne[1];
|
5087
|
+
cne[1] = cne[2];
|
5088
|
+
cne[2] = cne[3];
|
5089
|
+
cne[3] = 1;
|
5090
|
+
};
|
5091
|
+
|
5092
|
+
auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
|
5093
|
+
cnb[1] *= cne[1];
|
5094
|
+
cnb[2] *= cne[2];
|
5095
|
+
cnb[3] *= cne[3];
|
5096
|
+
};
|
5097
|
+
|
5098
|
+
for (int i = 0; i < 4; i++) {
|
5099
|
+
if (nr[i] != 1) {
|
5100
|
+
break;
|
5101
|
+
}
|
5102
|
+
if (i > 0) {
|
5103
|
+
collapse_nb(cnb0, cne0);
|
5104
|
+
collapse_nb(cnb1, cne1);
|
5105
|
+
collapse(cne0);
|
5106
|
+
collapse(cne1);
|
5107
|
+
}
|
5108
|
+
}
|
5109
|
+
{
|
5110
|
+
int64_t ne0 = cne0[0];
|
5111
|
+
int64_t ne1 = cne0[1];
|
5112
|
+
int64_t ne2 = cne0[2];
|
5113
|
+
int64_t ne3 = cne0[3];
|
5114
|
+
|
5115
|
+
int64_t ne10 = cne1[0];
|
5116
|
+
int64_t ne11 = cne1[1];
|
5117
|
+
int64_t ne12 = cne1[2];
|
5118
|
+
int64_t ne13 = cne1[3];
|
5119
|
+
|
5120
|
+
//size_t nb0 = cnb0[0];
|
5121
|
+
size_t nb1 = cnb0[1];
|
5122
|
+
size_t nb2 = cnb0[2];
|
5123
|
+
size_t nb3 = cnb0[3];
|
5124
|
+
|
5125
|
+
//size_t nb10 = cnb1[0];
|
5126
|
+
size_t nb11 = cnb1[1];
|
5127
|
+
size_t nb12 = cnb1[2];
|
5128
|
+
size_t nb13 = cnb1[3];
|
5129
|
+
|
5130
|
+
//size_t s0 = nb0 / sizeof(src1_t);
|
5131
|
+
size_t s1 = nb1 / sizeof(src1_t);
|
5132
|
+
size_t s2 = nb2 / sizeof(src1_t);
|
5133
|
+
size_t s3 = nb3 / sizeof(src1_t);
|
5134
|
+
|
5135
|
+
//size_t s10 = nb10 / sizeof(src1_t);
|
5136
|
+
size_t s11 = nb11 / sizeof(src1_t);
|
5137
|
+
size_t s12 = nb12 / sizeof(src1_t);
|
5138
|
+
size_t s13 = nb13 / sizeof(src1_t);
|
5139
|
+
|
5140
|
+
|
5141
|
+
const int block_size = 128;
|
5142
|
+
|
5143
|
+
int64_t hne0 = std::max(ne0/2LL, 1LL);
|
5144
|
+
|
5145
|
+
dim3 block_dims;
|
5146
|
+
block_dims.x = std::min<unsigned int>(hne0, block_size);
|
5147
|
+
block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
|
5148
|
+
block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
|
5149
|
+
|
5150
|
+
dim3 block_nums(
|
5151
|
+
(hne0 + block_dims.x - 1) / block_dims.x,
|
5152
|
+
(ne1 + block_dims.y - 1) / block_dims.y,
|
5153
|
+
(ne2*ne3 + block_dims.z - 1) / block_dims.z
|
5154
|
+
);
|
4822
5155
|
|
4823
|
-
|
4824
|
-
|
4825
|
-
|
4826
|
-
|
5156
|
+
if (block_nums.z > 65535) {
|
5157
|
+
// this is the maximum number of blocks in z direction, fallback to 1D grid kernel
|
5158
|
+
int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
|
5159
|
+
k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
|
5160
|
+
src0_dd, src1_dd, dst_dd,
|
5161
|
+
ne0, ne1, ne2, ne3,
|
5162
|
+
ne10, ne11, ne12, ne13,
|
5163
|
+
/* s0, */ s1, s2, s3,
|
5164
|
+
/* s10, */ s11, s12, s13);
|
5165
|
+
} else {
|
5166
|
+
k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
|
5167
|
+
src0_dd, src1_dd, dst_dd,
|
5168
|
+
ne0, ne1, ne2, ne3,
|
5169
|
+
ne10, ne11, ne12, ne13,
|
5170
|
+
/* s0, */ s1, s2, s3,
|
5171
|
+
/* s10, */ s11, s12, s13);
|
5172
|
+
}
|
5173
|
+
}
|
5174
|
+
}
|
5175
|
+
};
|
4827
5176
|
|
4828
5177
|
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
4829
5178
|
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
|
@@ -4845,14 +5194,14 @@ static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
|
|
4845
5194
|
sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
4846
5195
|
}
|
4847
5196
|
|
4848
|
-
static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
5197
|
+
static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
|
4849
5198
|
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
4850
5199
|
if (ncols < 1024) {
|
4851
5200
|
const dim3 block_dims(WARP_SIZE, 1, 1);
|
4852
|
-
norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
|
5201
|
+
norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
|
4853
5202
|
} else {
|
4854
5203
|
const dim3 block_dims(1024, 1, 1);
|
4855
|
-
norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
|
5204
|
+
norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
|
4856
5205
|
}
|
4857
5206
|
}
|
4858
5207
|
|
@@ -4874,34 +5223,10 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
|
|
4874
5223
|
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
|
4875
5224
|
}
|
4876
5225
|
|
4877
|
-
template<typename dst_t>
|
4878
|
-
static void
|
4879
|
-
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
4880
|
-
dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
4881
|
-
}
|
4882
|
-
|
4883
|
-
template<typename dst_t>
|
4884
|
-
static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
4885
|
-
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
4886
|
-
dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
4887
|
-
}
|
4888
|
-
|
4889
|
-
template<typename dst_t>
|
4890
|
-
static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
4891
|
-
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
4892
|
-
dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
4893
|
-
}
|
4894
|
-
|
4895
|
-
template<typename dst_t>
|
4896
|
-
static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
4897
|
-
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
4898
|
-
dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
4899
|
-
}
|
4900
|
-
|
4901
|
-
template<typename dst_t>
|
4902
|
-
static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
5226
|
+
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
5227
|
+
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
|
4903
5228
|
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
4904
|
-
dequantize_block<
|
5229
|
+
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
4905
5230
|
}
|
4906
5231
|
|
4907
5232
|
template<typename dst_t>
|
@@ -4950,6 +5275,64 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
|
|
4950
5275
|
#endif
|
4951
5276
|
}
|
4952
5277
|
|
5278
|
+
static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
5279
|
+
switch (type) {
|
5280
|
+
case GGML_TYPE_Q4_0:
|
5281
|
+
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
5282
|
+
case GGML_TYPE_Q4_1:
|
5283
|
+
return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
|
5284
|
+
case GGML_TYPE_Q5_0:
|
5285
|
+
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
5286
|
+
case GGML_TYPE_Q5_1:
|
5287
|
+
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
5288
|
+
case GGML_TYPE_Q8_0:
|
5289
|
+
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
5290
|
+
case GGML_TYPE_Q2_K:
|
5291
|
+
return dequantize_row_q2_K_cuda;
|
5292
|
+
case GGML_TYPE_Q3_K:
|
5293
|
+
return dequantize_row_q3_K_cuda;
|
5294
|
+
case GGML_TYPE_Q4_K:
|
5295
|
+
return dequantize_row_q4_K_cuda;
|
5296
|
+
case GGML_TYPE_Q5_K:
|
5297
|
+
return dequantize_row_q5_K_cuda;
|
5298
|
+
case GGML_TYPE_Q6_K:
|
5299
|
+
return dequantize_row_q6_K_cuda;
|
5300
|
+
case GGML_TYPE_F32:
|
5301
|
+
return dequantize_block_cuda<1, 1, convert_f32>;
|
5302
|
+
default:
|
5303
|
+
return nullptr;
|
5304
|
+
}
|
5305
|
+
}
|
5306
|
+
|
5307
|
+
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
5308
|
+
switch (type) {
|
5309
|
+
case GGML_TYPE_Q4_0:
|
5310
|
+
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
5311
|
+
case GGML_TYPE_Q4_1:
|
5312
|
+
return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
|
5313
|
+
case GGML_TYPE_Q5_0:
|
5314
|
+
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
5315
|
+
case GGML_TYPE_Q5_1:
|
5316
|
+
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
5317
|
+
case GGML_TYPE_Q8_0:
|
5318
|
+
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
5319
|
+
case GGML_TYPE_Q2_K:
|
5320
|
+
return dequantize_row_q2_K_cuda;
|
5321
|
+
case GGML_TYPE_Q3_K:
|
5322
|
+
return dequantize_row_q3_K_cuda;
|
5323
|
+
case GGML_TYPE_Q4_K:
|
5324
|
+
return dequantize_row_q4_K_cuda;
|
5325
|
+
case GGML_TYPE_Q5_K:
|
5326
|
+
return dequantize_row_q5_K_cuda;
|
5327
|
+
case GGML_TYPE_Q6_K:
|
5328
|
+
return dequantize_row_q6_K_cuda;
|
5329
|
+
case GGML_TYPE_F16:
|
5330
|
+
return dequantize_block_cuda<1, 1, convert_f16>;
|
5331
|
+
default:
|
5332
|
+
return nullptr;
|
5333
|
+
}
|
5334
|
+
}
|
5335
|
+
|
4953
5336
|
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
4954
5337
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
4955
5338
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
@@ -5038,13 +5421,22 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
|
|
5038
5421
|
dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
5039
5422
|
}
|
5040
5423
|
|
5041
|
-
static void
|
5042
|
-
GGML_ASSERT(ncols %
|
5424
|
+
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
5425
|
+
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
5043
5426
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
5044
5427
|
const dim3 block_nums(block_num_y, 1, 1);
|
5045
5428
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
5046
|
-
|
5047
|
-
<<<block_nums, block_dims, 0, stream>>>(vx,
|
5429
|
+
dequantize_mul_mat_vec<1, 1, convert_f16>
|
5430
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
5431
|
+
}
|
5432
|
+
|
5433
|
+
static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
5434
|
+
GGML_ASSERT(ncols % QK4_0 == 0);
|
5435
|
+
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
5436
|
+
const dim3 block_nums(block_num_y, 1, 1);
|
5437
|
+
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
5438
|
+
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
|
5439
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
5048
5440
|
}
|
5049
5441
|
|
5050
5442
|
static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
@@ -5128,83 +5520,6 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|
5128
5520
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
5129
5521
|
}
|
5130
5522
|
|
5131
|
-
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
5132
|
-
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
5133
|
-
dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
5134
|
-
}
|
5135
|
-
|
5136
|
-
static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
|
5137
|
-
const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
|
5138
|
-
dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
5139
|
-
}
|
5140
|
-
|
5141
|
-
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
5142
|
-
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
5143
|
-
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
5144
|
-
const dim3 block_nums(block_num_y, 1, 1);
|
5145
|
-
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
5146
|
-
dequantize_mul_mat_vec<1, 1, convert_f16>
|
5147
|
-
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
5148
|
-
}
|
5149
|
-
|
5150
|
-
static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
5151
|
-
switch (type) {
|
5152
|
-
case GGML_TYPE_Q4_0:
|
5153
|
-
return dequantize_row_q4_0_cuda;
|
5154
|
-
case GGML_TYPE_Q4_1:
|
5155
|
-
return dequantize_row_q4_1_cuda;
|
5156
|
-
case GGML_TYPE_Q5_0:
|
5157
|
-
return dequantize_row_q5_0_cuda;
|
5158
|
-
case GGML_TYPE_Q5_1:
|
5159
|
-
return dequantize_row_q5_1_cuda;
|
5160
|
-
case GGML_TYPE_Q8_0:
|
5161
|
-
return dequantize_row_q8_0_cuda;
|
5162
|
-
case GGML_TYPE_Q2_K:
|
5163
|
-
return dequantize_row_q2_K_cuda;
|
5164
|
-
case GGML_TYPE_Q3_K:
|
5165
|
-
return dequantize_row_q3_K_cuda;
|
5166
|
-
case GGML_TYPE_Q4_K:
|
5167
|
-
return dequantize_row_q4_K_cuda;
|
5168
|
-
case GGML_TYPE_Q5_K:
|
5169
|
-
return dequantize_row_q5_K_cuda;
|
5170
|
-
case GGML_TYPE_Q6_K:
|
5171
|
-
return dequantize_row_q6_K_cuda;
|
5172
|
-
case GGML_TYPE_F32:
|
5173
|
-
return convert_fp32_to_fp16_cuda;
|
5174
|
-
default:
|
5175
|
-
return nullptr;
|
5176
|
-
}
|
5177
|
-
}
|
5178
|
-
|
5179
|
-
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
5180
|
-
switch (type) {
|
5181
|
-
case GGML_TYPE_Q4_0:
|
5182
|
-
return dequantize_row_q4_0_cuda;
|
5183
|
-
case GGML_TYPE_Q4_1:
|
5184
|
-
return dequantize_row_q4_1_cuda;
|
5185
|
-
case GGML_TYPE_Q5_0:
|
5186
|
-
return dequantize_row_q5_0_cuda;
|
5187
|
-
case GGML_TYPE_Q5_1:
|
5188
|
-
return dequantize_row_q5_1_cuda;
|
5189
|
-
case GGML_TYPE_Q8_0:
|
5190
|
-
return dequantize_row_q8_0_cuda;
|
5191
|
-
case GGML_TYPE_Q2_K:
|
5192
|
-
return dequantize_row_q2_K_cuda;
|
5193
|
-
case GGML_TYPE_Q3_K:
|
5194
|
-
return dequantize_row_q3_K_cuda;
|
5195
|
-
case GGML_TYPE_Q4_K:
|
5196
|
-
return dequantize_row_q4_K_cuda;
|
5197
|
-
case GGML_TYPE_Q5_K:
|
5198
|
-
return dequantize_row_q5_K_cuda;
|
5199
|
-
case GGML_TYPE_Q6_K:
|
5200
|
-
return dequantize_row_q6_K_cuda;
|
5201
|
-
case GGML_TYPE_F16:
|
5202
|
-
return convert_fp16_to_fp32_cuda;
|
5203
|
-
default:
|
5204
|
-
return nullptr;
|
5205
|
-
}
|
5206
|
-
}
|
5207
|
-
|
5208
5523
|
static void ggml_mul_mat_q4_0_q8_1_cuda(
|
5209
5524
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
5210
5525
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
@@ -5697,6 +6012,39 @@ static void ggml_cpy_f32_f16_cuda(
|
|
5697
6012
|
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
5698
6013
|
}
|
5699
6014
|
|
6015
|
+
static void ggml_cpy_f32_q8_0_cuda(
|
6016
|
+
const char * cx, char * cdst, const int ne,
|
6017
|
+
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
6018
|
+
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
6019
|
+
|
6020
|
+
GGML_ASSERT(ne % QK8_0 == 0);
|
6021
|
+
const int num_blocks = ne / QK8_0;
|
6022
|
+
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
|
6023
|
+
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
6024
|
+
}
|
6025
|
+
|
6026
|
+
static void ggml_cpy_f32_q4_0_cuda(
|
6027
|
+
const char * cx, char * cdst, const int ne,
|
6028
|
+
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
6029
|
+
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
6030
|
+
|
6031
|
+
GGML_ASSERT(ne % QK4_0 == 0);
|
6032
|
+
const int num_blocks = ne / QK4_0;
|
6033
|
+
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
|
6034
|
+
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
6035
|
+
}
|
6036
|
+
|
6037
|
+
static void ggml_cpy_f32_q4_1_cuda(
|
6038
|
+
const char * cx, char * cdst, const int ne,
|
6039
|
+
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
6040
|
+
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
6041
|
+
|
6042
|
+
GGML_ASSERT(ne % QK4_1 == 0);
|
6043
|
+
const int num_blocks = ne / QK4_1;
|
6044
|
+
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
|
6045
|
+
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
6046
|
+
}
|
6047
|
+
|
5700
6048
|
static void ggml_cpy_f16_f16_cuda(
|
5701
6049
|
const char * cx, char * cdst, const int ne,
|
5702
6050
|
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
@@ -5739,20 +6087,26 @@ static void rope_cuda(
|
|
5739
6087
|
|
5740
6088
|
template<typename T>
|
5741
6089
|
static void rope_neox_cuda(
|
5742
|
-
const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
|
6090
|
+
const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
|
5743
6091
|
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
|
5744
6092
|
) {
|
5745
6093
|
GGML_ASSERT(ncols % 2 == 0);
|
5746
6094
|
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
5747
6095
|
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
5748
6096
|
const dim3 block_nums(nrows, num_blocks_x, 1);
|
6097
|
+
|
6098
|
+
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
6099
|
+
const float inv_ndims = -1.0f / n_dims;
|
6100
|
+
|
5749
6101
|
if (pos == nullptr) {
|
5750
6102
|
rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
|
5751
|
-
x, dst, ncols, pos, freq_scale, p_delta_rows,
|
6103
|
+
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
|
6104
|
+
theta_scale, inv_ndims
|
5752
6105
|
);
|
5753
6106
|
} else {
|
5754
6107
|
rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
|
5755
|
-
x, dst, ncols, pos, freq_scale, p_delta_rows,
|
6108
|
+
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
|
6109
|
+
theta_scale, inv_ndims
|
5756
6110
|
);
|
5757
6111
|
}
|
5758
6112
|
}
|
@@ -5777,6 +6131,27 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const
|
|
5777
6131
|
alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
|
5778
6132
|
}
|
5779
6133
|
|
6134
|
+
static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
6135
|
+
const dim3 block_dims(WARP_SIZE, 1, 1);
|
6136
|
+
const dim3 block_nums(1, nrows, 1);
|
6137
|
+
k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
6138
|
+
}
|
6139
|
+
|
6140
|
+
static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
|
6141
|
+
// bitonic sort requires ncols to be power of 2
|
6142
|
+
GGML_ASSERT((ncols & (ncols - 1)) == 0);
|
6143
|
+
|
6144
|
+
const dim3 block_dims(ncols, 1, 1);
|
6145
|
+
const dim3 block_nums(1, nrows, 1);
|
6146
|
+
if (order == GGML_SORT_ASC) {
|
6147
|
+
k_argsort_f32_i32<GGML_SORT_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
6148
|
+
} else if (order == GGML_SORT_DESC) {
|
6149
|
+
k_argsort_f32_i32<GGML_SORT_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
6150
|
+
} else {
|
6151
|
+
GGML_ASSERT(false);
|
6152
|
+
}
|
6153
|
+
}
|
6154
|
+
|
5780
6155
|
static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
|
5781
6156
|
const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
|
5782
6157
|
const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
|
@@ -5784,10 +6159,12 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
|
|
5784
6159
|
diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
|
5785
6160
|
}
|
5786
6161
|
|
5787
|
-
static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
|
5788
|
-
|
6162
|
+
static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
|
6163
|
+
int nth = WARP_SIZE;
|
6164
|
+
while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
|
6165
|
+
const dim3 block_dims(nth, 1, 1);
|
5789
6166
|
const dim3 block_nums(nrows_x, 1, 1);
|
5790
|
-
soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
|
6167
|
+
soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
5791
6168
|
}
|
5792
6169
|
|
5793
6170
|
static void im2col_f32_f16_cuda(const float * x, half * dst,
|
@@ -5867,7 +6244,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
|
|
5867
6244
|
return ptr;
|
5868
6245
|
}
|
5869
6246
|
#ifdef DEBUG_CUDA_MALLOC
|
5870
|
-
fprintf(stderr, "%s: %d buffers, max_size = %u
|
6247
|
+
fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
|
5871
6248
|
(uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
|
5872
6249
|
#endif
|
5873
6250
|
void * ptr;
|
@@ -6005,7 +6382,7 @@ void * ggml_cuda_host_malloc(size_t size) {
|
|
6005
6382
|
// The allocation error can be bypassed. A null ptr will assigned out of this function.
|
6006
6383
|
// This can fixed the OOM error in WSL.
|
6007
6384
|
cudaGetLastError();
|
6008
|
-
fprintf(stderr, "WARNING: failed to allocate %.2f
|
6385
|
+
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
|
6009
6386
|
size/1024.0/1024.0, cudaGetErrorString(err));
|
6010
6387
|
return nullptr;
|
6011
6388
|
}
|
@@ -6050,75 +6427,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
|
|
6050
6427
|
const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
|
6051
6428
|
if (nb0 == ts && nb1 == ts*ne0/bs) {
|
6052
6429
|
return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
|
6053
|
-
}
|
6054
|
-
if (nb0 == ts) {
|
6430
|
+
} else if (nb0 == ts) {
|
6055
6431
|
return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
|
6056
|
-
}
|
6057
|
-
|
6058
|
-
|
6059
|
-
|
6060
|
-
|
6061
|
-
|
6062
|
-
|
6063
|
-
}
|
6064
|
-
return cudaSuccess;
|
6065
|
-
}
|
6066
|
-
|
6067
|
-
static void ggml_cuda_op_repeat(
|
6068
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6069
|
-
const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
|
6070
|
-
// guaranteed to be an integer due to the check in ggml_can_repeat
|
6071
|
-
const int64_t ne0 = dst->ne[0];
|
6072
|
-
const int64_t ne1 = dst->ne[1];
|
6073
|
-
const int64_t ne2 = dst->ne[2];
|
6074
|
-
const int64_t ne3 = dst->ne[3];
|
6075
|
-
|
6076
|
-
const int64_t ne00 = src0->ne[0];
|
6077
|
-
const int64_t ne01 = src0->ne[1];
|
6078
|
-
const int64_t ne02 = src0->ne[2];
|
6079
|
-
const int64_t ne03 = src0->ne[3];
|
6080
|
-
|
6081
|
-
const size_t nb0 = dst->nb[0];
|
6082
|
-
const size_t nb1 = dst->nb[1];
|
6083
|
-
const size_t nb2 = dst->nb[2];
|
6084
|
-
const size_t nb3 = dst->nb[3];
|
6085
|
-
|
6086
|
-
const size_t nb00 = src0->nb[0];
|
6087
|
-
const size_t nb01 = src0->nb[1];
|
6088
|
-
const size_t nb02 = src0->nb[2];
|
6089
|
-
const size_t nb03 = src0->nb[3];
|
6090
|
-
|
6091
|
-
const int nr0 = (int)(ne0/ne00);
|
6092
|
-
const int nr1 = (int)(ne1/ne01);
|
6093
|
-
const int nr2 = (int)(ne2/ne02);
|
6094
|
-
const int nr3 = (int)(ne3/ne03);
|
6095
|
-
|
6096
|
-
// TODO: support for transposed / permuted tensors
|
6097
|
-
GGML_ASSERT(nb0 == sizeof(float));
|
6098
|
-
GGML_ASSERT(nb00 == sizeof(float));
|
6099
|
-
|
6100
|
-
// TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
|
6101
|
-
for (int i3 = 0; i3 < nr3; i3++) {
|
6102
|
-
for (int k3 = 0; k3 < ne03; k3++) {
|
6103
|
-
for (int i2 = 0; i2 < nr2; i2++) {
|
6104
|
-
for (int k2 = 0; k2 < ne02; k2++) {
|
6105
|
-
for (int i1 = 0; i1 < nr1; i1++) {
|
6106
|
-
for (int k1 = 0; k1 < ne01; k1++) {
|
6107
|
-
for (int i0 = 0; i0 < nr0; i0++) {
|
6108
|
-
CUDA_CHECK(cudaMemcpyAsync(
|
6109
|
-
(char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
|
6110
|
-
(const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
|
6111
|
-
ne00*nb0, cudaMemcpyDeviceToDevice, stream));
|
6112
|
-
}
|
6113
|
-
}
|
6114
|
-
}
|
6115
|
-
}
|
6116
|
-
}
|
6432
|
+
} else {
|
6433
|
+
for (int64_t i1 = 0; i1 < i1_diff; i1++) {
|
6434
|
+
const void * rx = (const void *) ((const char *) x + i1*nb1);
|
6435
|
+
void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
|
6436
|
+
// pretend the row is a matrix with cols=1
|
6437
|
+
cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
|
6438
|
+
if (r != cudaSuccess) return r;
|
6117
6439
|
}
|
6440
|
+
return cudaSuccess;
|
6118
6441
|
}
|
6119
|
-
|
6120
|
-
(void) src1;
|
6121
|
-
(void) src1_d;
|
6122
6442
|
}
|
6123
6443
|
|
6124
6444
|
static void ggml_cuda_op_get_rows(
|
@@ -6165,44 +6485,55 @@ static void ggml_cuda_op_get_rows(
|
|
6165
6485
|
}
|
6166
6486
|
}
|
6167
6487
|
|
6168
|
-
|
6488
|
+
template<class op>
|
6489
|
+
inline void ggml_cuda_op_bin_bcast(
|
6169
6490
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6170
6491
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6171
6492
|
|
6172
6493
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
6173
6494
|
|
6174
|
-
const int64_t ne10 = src1->ne[0];
|
6175
|
-
const int64_t ne11 = src1->ne[1];
|
6176
|
-
|
6177
6495
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
6178
|
-
|
6496
|
+
op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
6179
6497
|
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
6180
|
-
|
6498
|
+
op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream);
|
6181
6499
|
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
6182
|
-
|
6500
|
+
op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream);
|
6183
6501
|
} else {
|
6184
|
-
fprintf(stderr, "src0
|
6502
|
+
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
|
6503
|
+
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
|
6185
6504
|
GGML_ASSERT(false);
|
6186
6505
|
}
|
6506
|
+
}
|
6507
|
+
|
6508
|
+
static void ggml_cuda_op_repeat(
|
6509
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6510
|
+
const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & main_stream) {
|
6511
|
+
|
6512
|
+
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
|
6187
6513
|
|
6188
6514
|
(void) src1;
|
6189
|
-
(void)
|
6515
|
+
(void) src1_d;
|
6190
6516
|
}
|
6191
6517
|
|
6192
|
-
inline void
|
6518
|
+
inline void ggml_cuda_op_add(
|
6193
6519
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6194
6520
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6195
6521
|
|
6196
|
-
|
6197
|
-
|
6198
|
-
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
6522
|
+
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
6523
|
+
}
|
6199
6524
|
|
6200
|
-
|
6201
|
-
const
|
6525
|
+
inline void ggml_cuda_op_mul(
|
6526
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6527
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6202
6528
|
|
6203
|
-
|
6529
|
+
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
6530
|
+
}
|
6204
6531
|
|
6205
|
-
|
6532
|
+
inline void ggml_cuda_op_div(
|
6533
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6534
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6535
|
+
|
6536
|
+
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
6206
6537
|
}
|
6207
6538
|
|
6208
6539
|
inline void ggml_cuda_op_gelu(
|
@@ -6271,7 +6602,10 @@ inline void ggml_cuda_op_norm(
|
|
6271
6602
|
const int64_t ne00 = src0->ne[0];
|
6272
6603
|
const int64_t nrows = ggml_nrows(src0);
|
6273
6604
|
|
6274
|
-
|
6605
|
+
float eps;
|
6606
|
+
memcpy(&eps, dst->op_params, sizeof(float));
|
6607
|
+
|
6608
|
+
norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
|
6275
6609
|
|
6276
6610
|
(void) src1;
|
6277
6611
|
(void) dst;
|
@@ -6426,6 +6760,8 @@ inline void ggml_cuda_op_mul_mat_vec_q(
|
|
6426
6760
|
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
6427
6761
|
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
6428
6762
|
|
6763
|
+
GGML_ASSERT(ggml_nrows(src1) == 1);
|
6764
|
+
|
6429
6765
|
const int64_t ne00 = src0->ne[0];
|
6430
6766
|
const int64_t row_diff = row_high - row_low;
|
6431
6767
|
|
@@ -6485,7 +6821,8 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
|
6485
6821
|
size_t ash;
|
6486
6822
|
dfloat * src1_dfloat = nullptr; // dfloat == half
|
6487
6823
|
|
6488
|
-
bool src1_convert_f16 =
|
6824
|
+
bool src1_convert_f16 =
|
6825
|
+
src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
6489
6826
|
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
|
6490
6827
|
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
6491
6828
|
|
@@ -6707,15 +7044,14 @@ inline void ggml_cuda_op_rope(
|
|
6707
7044
|
GGML_ASSERT(false);
|
6708
7045
|
rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
|
6709
7046
|
} else if (is_neox) {
|
6710
|
-
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
|
6711
7047
|
if (src0->type == GGML_TYPE_F32) {
|
6712
7048
|
rope_neox_cuda(
|
6713
|
-
(const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
7049
|
+
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
6714
7050
|
attn_factor, corr_dims, main_stream
|
6715
7051
|
);
|
6716
7052
|
} else if (src0->type == GGML_TYPE_F16) {
|
6717
7053
|
rope_neox_cuda(
|
6718
|
-
(const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
7054
|
+
(const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
6719
7055
|
attn_factor, corr_dims, main_stream
|
6720
7056
|
);
|
6721
7057
|
} else {
|
@@ -6812,6 +7148,42 @@ inline void ggml_cuda_op_im2col(
|
|
6812
7148
|
(void) src0_dd;
|
6813
7149
|
}
|
6814
7150
|
|
7151
|
+
inline void ggml_cuda_op_sum_rows(
|
7152
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
7153
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
7154
|
+
|
7155
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
7156
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
7157
|
+
|
7158
|
+
const int64_t ncols = src0->ne[0];
|
7159
|
+
const int64_t nrows = ggml_nrows(src0);
|
7160
|
+
|
7161
|
+
sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
|
7162
|
+
|
7163
|
+
(void) src1;
|
7164
|
+
(void) dst;
|
7165
|
+
(void) src1_dd;
|
7166
|
+
}
|
7167
|
+
|
7168
|
+
inline void ggml_cuda_op_argsort(
|
7169
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
7170
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
7171
|
+
|
7172
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
7173
|
+
GGML_ASSERT( dst->type == GGML_TYPE_I32);
|
7174
|
+
|
7175
|
+
const int64_t ncols = src0->ne[0];
|
7176
|
+
const int64_t nrows = ggml_nrows(src0);
|
7177
|
+
|
7178
|
+
enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
|
7179
|
+
|
7180
|
+
argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
|
7181
|
+
|
7182
|
+
(void) src1;
|
7183
|
+
(void) dst;
|
7184
|
+
(void) src1_dd;
|
7185
|
+
}
|
7186
|
+
|
6815
7187
|
inline void ggml_cuda_op_diag_mask_inf(
|
6816
7188
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6817
7189
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
@@ -6839,14 +7211,18 @@ inline void ggml_cuda_op_soft_max(
|
|
6839
7211
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6840
7212
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
6841
7213
|
|
7214
|
+
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
|
7215
|
+
|
6842
7216
|
const int64_t ne00 = src0->ne[0];
|
6843
|
-
const int64_t
|
7217
|
+
const int64_t nrows_x = ggml_nrows(src0);
|
7218
|
+
const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
|
6844
7219
|
|
6845
|
-
|
7220
|
+
float scale = 1.0f;
|
7221
|
+
memcpy(&scale, dst->op_params, sizeof(float));
|
7222
|
+
|
7223
|
+
soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
|
6846
7224
|
|
6847
|
-
(void) src1;
|
6848
7225
|
(void) dst;
|
6849
|
-
(void) src1_dd;
|
6850
7226
|
}
|
6851
7227
|
|
6852
7228
|
inline void ggml_cuda_op_scale(
|
@@ -7016,7 +7392,7 @@ static void ggml_cuda_op_mul_mat(
|
|
7016
7392
|
const int64_t ne01 = src0->ne[1];
|
7017
7393
|
const int64_t ne02 = src0->ne[2];
|
7018
7394
|
const int64_t ne03 = src0->ne[3];
|
7019
|
-
|
7395
|
+
const int64_t nrows0 = ggml_nrows(src0);
|
7020
7396
|
|
7021
7397
|
const int64_t ne10 = src1->ne[0];
|
7022
7398
|
const int64_t ne11 = src1->ne[1];
|
@@ -7052,10 +7428,9 @@ static void ggml_cuda_op_mul_mat(
|
|
7052
7428
|
|
7053
7429
|
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
7054
7430
|
const bool src0_is_contiguous = ggml_is_contiguous(src0);
|
7055
|
-
|
7056
7431
|
const bool src1_is_contiguous = ggml_is_contiguous(src1);
|
7057
|
-
|
7058
|
-
|
7432
|
+
|
7433
|
+
const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
7059
7434
|
|
7060
7435
|
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
7061
7436
|
GGML_ASSERT(!(split && ne02 > 1));
|
@@ -7180,7 +7555,7 @@ static void ggml_cuda_op_mul_mat(
|
|
7180
7555
|
const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
|
7181
7556
|
|
7182
7557
|
// for split tensors the data begins at i0 == i0_offset_low
|
7183
|
-
char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
|
7558
|
+
char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
|
7184
7559
|
float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
|
7185
7560
|
char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
|
7186
7561
|
float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
|
@@ -7325,6 +7700,10 @@ static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, gg
|
|
7325
7700
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
|
7326
7701
|
}
|
7327
7702
|
|
7703
|
+
static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7704
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
|
7705
|
+
}
|
7706
|
+
|
7328
7707
|
static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7329
7708
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
|
7330
7709
|
}
|
@@ -7350,7 +7729,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
|
|
7350
7729
|
}
|
7351
7730
|
|
7352
7731
|
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
7353
|
-
if (!g_cublas_loaded)
|
7732
|
+
if (!g_cublas_loaded) return false;
|
7354
7733
|
|
7355
7734
|
const int64_t ne10 = src1->ne[0];
|
7356
7735
|
|
@@ -7428,7 +7807,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
|
7428
7807
|
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
|
7429
7808
|
}
|
7430
7809
|
|
7431
|
-
__global__
|
7810
|
+
static __global__ void k_compute_batched_ptrs(
|
7432
7811
|
const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
|
7433
7812
|
const void ** ptrs_src, void ** ptrs_dst,
|
7434
7813
|
int ne12, int ne13,
|
@@ -7484,9 +7863,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
7484
7863
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
7485
7864
|
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
7486
7865
|
|
7487
|
-
|
7488
|
-
CUDA_CHECK(cudaGetDevice(&id));
|
7489
|
-
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
|
7866
|
+
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
|
7490
7867
|
|
7491
7868
|
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
7492
7869
|
void * src0_ddq = src0_extra->data_device[g_main_device];
|
@@ -7543,7 +7920,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
7543
7920
|
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
7544
7921
|
// use cublasGemmStridedBatchedEx
|
7545
7922
|
CUBLAS_CHECK(
|
7546
|
-
cublasGemmStridedBatchedEx(g_cublas_handles[
|
7923
|
+
cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
7547
7924
|
ne01, ne11, ne10,
|
7548
7925
|
&alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
|
7549
7926
|
(const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
|
@@ -7577,7 +7954,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
7577
7954
|
CUDA_CHECK(cudaGetLastError());
|
7578
7955
|
|
7579
7956
|
CUBLAS_CHECK(
|
7580
|
-
cublasGemmBatchedEx(g_cublas_handles[
|
7957
|
+
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
7581
7958
|
ne01, ne11, ne10,
|
7582
7959
|
&alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
|
7583
7960
|
(const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
|
@@ -7647,10 +8024,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
7647
8024
|
#ifdef GGML_CUDA_FORCE_DMMV
|
7648
8025
|
const bool use_mul_mat_vec_q = false;
|
7649
8026
|
#else
|
7650
|
-
const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
|
8027
|
+
const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
|
7651
8028
|
#endif // GGML_CUDA_FORCE_DMMV
|
7652
8029
|
|
7653
8030
|
if (use_mul_mat_vec_q) {
|
8031
|
+
// NOTE: this kernel does not support ggml_nrows(src1) > 1
|
7654
8032
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
|
7655
8033
|
} else {
|
7656
8034
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
|
@@ -7675,42 +8053,255 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
7675
8053
|
}
|
7676
8054
|
}
|
7677
8055
|
|
7678
|
-
|
7679
|
-
|
7680
|
-
|
8056
|
+
#if 0
|
8057
|
+
template<typename ... Srcs>
|
8058
|
+
static __global__ void k_compute_batched_ptrs_id(
|
8059
|
+
const void ** ptrs_src, void ** ptrs_dst,
|
8060
|
+
int ne12, int ne13,
|
8061
|
+
int ne23,
|
8062
|
+
int nb02, int nb03,
|
8063
|
+
int nb12, int nb13,
|
8064
|
+
int nb2, int nb3,
|
8065
|
+
int r2, int r3,
|
8066
|
+
ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
|
8067
|
+
const half * src1_f16, half * dst_f16,
|
8068
|
+
const int32_t * ids, const int id,
|
8069
|
+
Srcs... src0s) {
|
8070
|
+
|
8071
|
+
int i = ids[id];
|
8072
|
+
|
8073
|
+
half * src0_f16;
|
8074
|
+
const void * srcs_ar[] = { (const half *) src0s... };
|
8075
|
+
if (src0_type == GGML_TYPE_F16) {
|
8076
|
+
src0_f16 = (half *) srcs_ar[i];
|
8077
|
+
} else {
|
8078
|
+
src0_f16 = src0_as_f16;
|
8079
|
+
if (threadIdx.x == 0 && threadIdx.y == 0) {
|
8080
|
+
const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
|
8081
|
+
to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
|
8082
|
+
}
|
8083
|
+
}
|
7681
8084
|
|
7682
|
-
|
7683
|
-
|
8085
|
+
int i13 = blockIdx.x * blockDim.x + threadIdx.x;
|
8086
|
+
int i12 = blockIdx.y * blockDim.y + threadIdx.y;
|
8087
|
+
|
8088
|
+
if (i13 >= ne13 || i12 >= ne12) {
|
8089
|
+
return;
|
8090
|
+
}
|
8091
|
+
|
8092
|
+
int i03 = i13 / r3;
|
8093
|
+
int i02 = i12 / r2;
|
8094
|
+
|
8095
|
+
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02 + i03*nb03;
|
8096
|
+
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
|
8097
|
+
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
|
7684
8098
|
}
|
7685
8099
|
|
7686
|
-
static void
|
7687
|
-
const
|
7688
|
-
|
8100
|
+
static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
|
8101
|
+
const struct ggml_tensor * ids = dst->src[0];
|
8102
|
+
const struct ggml_tensor * src1 = dst->src[1];
|
8103
|
+
const struct ggml_tensor * src00 = dst->src[2];
|
7689
8104
|
|
7690
|
-
|
7691
|
-
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
8105
|
+
const int id = dst->op_params[0];
|
7692
8106
|
|
7693
|
-
GGML_ASSERT(
|
7694
|
-
GGML_ASSERT(
|
8107
|
+
GGML_ASSERT(!ggml_is_transposed(src00));
|
8108
|
+
GGML_ASSERT(!ggml_is_transposed(src1));
|
7695
8109
|
|
7696
|
-
|
7697
|
-
|
7698
|
-
GGML_ASSERT(src0->ne[3] == 1);
|
8110
|
+
GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
|
8111
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
7699
8112
|
|
7700
|
-
const int64_t
|
7701
|
-
const int64_t
|
7702
|
-
const int64_t
|
8113
|
+
const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
|
8114
|
+
const int64_t ne01 = src00->ne[1];
|
8115
|
+
const int64_t ne02 = src00->ne[2];
|
8116
|
+
const int64_t ne03 = src00->ne[3];
|
8117
|
+
|
8118
|
+
//const int64_t nb01 = src00->nb[1];
|
8119
|
+
const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
|
8120
|
+
const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
|
7703
8121
|
|
7704
8122
|
const int64_t ne10 = src1->ne[0];
|
7705
8123
|
const int64_t ne11 = src1->ne[1];
|
7706
|
-
|
8124
|
+
const int64_t ne12 = src1->ne[2];
|
8125
|
+
const int64_t ne13 = src1->ne[3];
|
7707
8126
|
|
7708
|
-
const int64_t
|
7709
|
-
const int64_t
|
7710
|
-
const int64_t
|
8127
|
+
//const int64_t nb11 = src1->nb[1];
|
8128
|
+
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
|
8129
|
+
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
|
7711
8130
|
|
7712
|
-
|
7713
|
-
|
8131
|
+
const int64_t ne1 = ggml_nelements(src1);
|
8132
|
+
const int64_t ne = ggml_nelements(dst);
|
8133
|
+
|
8134
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
8135
|
+
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
8136
|
+
|
8137
|
+
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
|
8138
|
+
|
8139
|
+
//ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
8140
|
+
//void * src0_ddq = src0_extra->data_device[g_main_device];
|
8141
|
+
//half * src0_as_f16 = (half *) src0_ddq;
|
8142
|
+
|
8143
|
+
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
8144
|
+
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
8145
|
+
|
8146
|
+
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
8147
|
+
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
8148
|
+
|
8149
|
+
// convert src1 to fp16
|
8150
|
+
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
8151
|
+
GGML_ASSERT(to_fp16_cuda != nullptr);
|
8152
|
+
|
8153
|
+
size_t src1_as = 0;
|
8154
|
+
half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
|
8155
|
+
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
|
8156
|
+
|
8157
|
+
size_t dst_as = 0;
|
8158
|
+
half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
|
8159
|
+
|
8160
|
+
GGML_ASSERT(ne12 % ne02 == 0);
|
8161
|
+
GGML_ASSERT(ne13 % ne03 == 0);
|
8162
|
+
|
8163
|
+
// broadcast factors
|
8164
|
+
const int64_t r2 = ne12/ne02;
|
8165
|
+
const int64_t r3 = ne13/ne03;
|
8166
|
+
|
8167
|
+
const half alpha_f16 = 1.0f;
|
8168
|
+
const half beta_f16 = 0.0f;
|
8169
|
+
|
8170
|
+
// use cublasGemmBatchedEx
|
8171
|
+
const int ne23 = ne12*ne13;
|
8172
|
+
|
8173
|
+
const void ** ptrs_src = nullptr;
|
8174
|
+
void ** ptrs_dst = nullptr;
|
8175
|
+
|
8176
|
+
size_t ptrs_src_s = 0;
|
8177
|
+
size_t ptrs_dst_s = 0;
|
8178
|
+
|
8179
|
+
ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
|
8180
|
+
ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
|
8181
|
+
|
8182
|
+
int64_t src0_ne = ggml_nelements(src00);
|
8183
|
+
half * src0_as_f16 = nullptr;
|
8184
|
+
size_t src0_as = 0;
|
8185
|
+
if (src00->type != GGML_TYPE_F16) {
|
8186
|
+
src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
|
8187
|
+
}
|
8188
|
+
|
8189
|
+
static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
|
8190
|
+
dim3 block_dims(ne13, ne12);
|
8191
|
+
k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
|
8192
|
+
ptrs_src, ptrs_dst,
|
8193
|
+
ne12, ne13,
|
8194
|
+
ne23,
|
8195
|
+
ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
|
8196
|
+
nb12, nb13,
|
8197
|
+
dst->nb[2], dst->nb[3],
|
8198
|
+
r2, r3,
|
8199
|
+
src00->type, src0_as_f16, src0_ne,
|
8200
|
+
src1_as_f16, dst_f16,
|
8201
|
+
(const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
|
8202
|
+
dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
|
8203
|
+
dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
|
8204
|
+
dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
|
8205
|
+
dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
|
8206
|
+
);
|
8207
|
+
CUDA_CHECK(cudaGetLastError());
|
8208
|
+
|
8209
|
+
CUBLAS_CHECK(
|
8210
|
+
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
8211
|
+
ne01, ne11, ne10,
|
8212
|
+
&alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
|
8213
|
+
(const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
|
8214
|
+
&beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
|
8215
|
+
ne23,
|
8216
|
+
CUBLAS_COMPUTE_16F,
|
8217
|
+
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
8218
|
+
|
8219
|
+
if (src0_as != 0) {
|
8220
|
+
ggml_cuda_pool_free(src0_as_f16, src0_as);
|
8221
|
+
}
|
8222
|
+
if (ptrs_src_s != 0) {
|
8223
|
+
ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
|
8224
|
+
}
|
8225
|
+
if (ptrs_dst_s != 0) {
|
8226
|
+
ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
|
8227
|
+
}
|
8228
|
+
|
8229
|
+
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
8230
|
+
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
|
8231
|
+
|
8232
|
+
ggml_cuda_pool_free(src1_as_f16, src1_as);
|
8233
|
+
ggml_cuda_pool_free(dst_f16, dst_as);
|
8234
|
+
}
|
8235
|
+
#endif
|
8236
|
+
|
8237
|
+
// Matrix multiplication where the left-hand matrix is chosen at run time.
//
// The operands are taken from dst rather than from the arguments:
//   dst->src[0]       - ids tensor; its int32 values are read from the main device
//   dst->src[1]       - right-hand operand, passed on to ggml_cuda_mul_mat
//   dst->src[2 + k]   - candidate left-hand matrices
//   dst->op_params[0] - position inside ids of the index to use
// _src0 and _src1 are ignored.
//
// The selected index is copied device -> host on the main device's stream 0 and
// that stream is synchronized before the value is used, so this call blocks the host.
static void ggml_cuda_mul_mat_id(const ggml_tensor * _src0, const ggml_tensor * _src1, ggml_tensor * dst) {
#if 0
//#ifdef CUDA_USE_TENSOR_CORES
//    const bool use_tensor_cores = true;
//#else
//    const bool use_tensor_cores = false;
//#endif

    ggml_cuda_mul_mat_id_cublas(dst);

    // TODO: mmq/mmv support
#else
    const struct ggml_tensor * ids  = dst->src[0];
    const struct ggml_tensor * src1 = dst->src[1];
    const int id = dst->op_params[0];

    int32_t * ids_dev = (int32_t *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];

    // fetch the selected index from device memory; the value is needed on the host
    // to pick the source tensor, hence the synchronization
    int32_t a_id;
    CUDA_CHECK(cudaMemcpyAsync(&a_id, ids_dev + id, sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));

    GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
    // the candidates start at slot 2 of dst->src; never index past the end of that array
    GGML_ASSERT(a_id + 2 < GGML_MAX_SRC);

    const struct ggml_tensor * src0 = dst->src[a_id + 2];
    // unused src slots are null; ggml_cuda_mul_mat dereferences its first argument
    GGML_ASSERT(src0 != nullptr);

    ggml_cuda_mul_mat(src0, src1, dst);
#endif

    (void) _src0;
    (void) _src1;
}
|
8268
|
+
|
8269
|
+
// GGML_OP_SCALE entry point: hands the tensors to ggml_cuda_op_flatten together
// with ggml_cuda_op_scale as the operation to run.
static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
}
|
8272
|
+
|
8273
|
+
// Clamp entry point: hands the tensors to ggml_cuda_op_flatten together with
// ggml_cuda_op_clamp as the operation to run.
static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
}
|
8276
|
+
|
8277
|
+
static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
8278
|
+
const int64_t ne = ggml_nelements(src0);
|
8279
|
+
GGML_ASSERT(ne == ggml_nelements(src1));
|
8280
|
+
|
8281
|
+
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
|
8282
|
+
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
8283
|
+
|
8284
|
+
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
|
8285
|
+
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
|
8286
|
+
|
8287
|
+
const int64_t ne00 = src0->ne[0];
|
8288
|
+
const int64_t ne01 = src0->ne[1];
|
8289
|
+
GGML_ASSERT(src0->ne[3] == 1);
|
8290
|
+
|
8291
|
+
const int64_t nb00 = src0->nb[0];
|
8292
|
+
const int64_t nb01 = src0->nb[1];
|
8293
|
+
const int64_t nb02 = src0->nb[2];
|
8294
|
+
|
8295
|
+
const int64_t ne10 = src1->ne[0];
|
8296
|
+
const int64_t ne11 = src1->ne[1];
|
8297
|
+
GGML_ASSERT(src1->ne[3] == 1);
|
8298
|
+
|
8299
|
+
const int64_t nb10 = src1->nb[0];
|
8300
|
+
const int64_t nb11 = src1->nb[1];
|
8301
|
+
const int64_t nb12 = src1->nb[2];
|
8302
|
+
|
8303
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
8304
|
+
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
7714
8305
|
|
7715
8306
|
const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
7716
8307
|
const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
@@ -7719,14 +8310,17 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
|
|
7719
8310
|
char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
|
7720
8311
|
|
7721
8312
|
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
7722
|
-
ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
|
7723
|
-
ne10, ne11, nb10, nb11, nb12, main_stream);
|
8313
|
+
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
7724
8314
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
7725
|
-
ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
|
7726
|
-
|
8315
|
+
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
8316
|
+
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
8317
|
+
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
8318
|
+
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
8319
|
+
ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
8320
|
+
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
8321
|
+
ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
7727
8322
|
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
7728
|
-
ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
|
7729
|
-
ne10, ne11, nb10, nb11, nb12, main_stream);
|
8323
|
+
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
7730
8324
|
} else {
|
7731
8325
|
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
7732
8326
|
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
@@ -7737,6 +8331,7 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
|
|
7737
8331
|
}
|
7738
8332
|
|
7739
8333
|
// Duplicate src0 into dst by delegating to ggml_cuda_cpy.
// dst is passed in ggml_cuda_cpy's src1 position and its dst argument is null;
// the src1 argument of this function is ignored.
static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    (void) src1;

    // TODO: why do we pass dst as src1 here?
    ggml_cuda_cpy(src0, dst, nullptr);
}
|
@@ -7762,6 +8357,16 @@ static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1,
|
|
7762
8357
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
|
7763
8358
|
}
|
7764
8359
|
|
8360
|
+
// GGML_OP_SUM_ROWS entry point. Requires a contiguous src0, then hands the tensors
// to ggml_cuda_op_flatten together with ggml_cuda_op_sum_rows.
static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(ggml_is_contiguous(src0));

    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
}
|
8364
|
+
|
8365
|
+
// GGML_OP_ARGSORT entry point. Requires a contiguous src0, then hands the tensors
// to ggml_cuda_op_flatten together with ggml_cuda_op_argsort.
static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(ggml_is_contiguous(src0));

    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
}
|
8369
|
+
|
7765
8370
|
static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7766
8371
|
(void) src0;
|
7767
8372
|
(void) src1;
|
@@ -8017,8 +8622,9 @@ void ggml_cuda_set_main_device(const int main_device) {
|
|
8017
8622
|
main_device, g_device_count, g_main_device);
|
8018
8623
|
return;
|
8019
8624
|
}
|
8020
|
-
|
8021
|
-
if (g_device_count > 1) {
|
8625
|
+
|
8626
|
+
if (g_main_device != main_device && g_device_count > 1) {
|
8627
|
+
g_main_device = main_device;
|
8022
8628
|
cudaDeviceProp prop;
|
8023
8629
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
|
8024
8630
|
fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
|
@@ -8044,7 +8650,7 @@ void ggml_cuda_free_scratch() {
|
|
8044
8650
|
}
|
8045
8651
|
|
8046
8652
|
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
8047
|
-
if (!g_cublas_loaded)
|
8653
|
+
if (!g_cublas_loaded) return false;
|
8048
8654
|
|
8049
8655
|
ggml_cuda_func_t func;
|
8050
8656
|
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
@@ -8080,6 +8686,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8080
8686
|
case GGML_OP_MUL:
|
8081
8687
|
func = ggml_cuda_mul;
|
8082
8688
|
break;
|
8689
|
+
case GGML_OP_DIV:
|
8690
|
+
func = ggml_cuda_div;
|
8691
|
+
break;
|
8083
8692
|
case GGML_OP_UNARY:
|
8084
8693
|
switch (ggml_get_unary_op(tensor)) {
|
8085
8694
|
case GGML_UNARY_OP_GELU:
|
@@ -8093,7 +8702,8 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8093
8702
|
break;
|
8094
8703
|
default:
|
8095
8704
|
return false;
|
8096
|
-
}
|
8705
|
+
}
|
8706
|
+
break;
|
8097
8707
|
case GGML_OP_NORM:
|
8098
8708
|
func = ggml_cuda_norm;
|
8099
8709
|
break;
|
@@ -8106,6 +8716,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8106
8716
|
}
|
8107
8717
|
func = ggml_cuda_mul_mat;
|
8108
8718
|
break;
|
8719
|
+
case GGML_OP_MUL_MAT_ID:
|
8720
|
+
if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
|
8721
|
+
return false;
|
8722
|
+
}
|
8723
|
+
func = ggml_cuda_mul_mat_id;
|
8724
|
+
break;
|
8109
8725
|
case GGML_OP_SCALE:
|
8110
8726
|
func = ggml_cuda_scale;
|
8111
8727
|
break;
|
@@ -8145,6 +8761,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8145
8761
|
case GGML_OP_IM2COL:
|
8146
8762
|
func = ggml_cuda_im2col;
|
8147
8763
|
break;
|
8764
|
+
case GGML_OP_SUM_ROWS:
|
8765
|
+
func = ggml_cuda_sum_rows;
|
8766
|
+
break;
|
8767
|
+
case GGML_OP_ARGSORT:
|
8768
|
+
func = ggml_cuda_argsort;
|
8769
|
+
break;
|
8148
8770
|
default:
|
8149
8771
|
return false;
|
8150
8772
|
}
|
@@ -8161,7 +8783,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
8161
8783
|
|
8162
8784
|
// Returns the number of devices reported by cudaGetDeviceCount,
// or 0 when that query does not succeed.
int ggml_cuda_get_device_count() {
    int n_devices = 0;
    const cudaError_t status = cudaGetDeviceCount(&n_devices);
    return status == cudaSuccess ? n_devices : 0;
}
|
8167
8791
|
|
@@ -8177,27 +8801,16 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
|
|
8177
8801
|
|
8178
8802
|
#define UNUSED GGML_UNUSED
|
8179
8803
|
|
8180
|
-
|
8181
|
-
};
|
8182
|
-
|
8183
|
-
static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
8184
|
-
return GGML_CUDA_NAME;
|
8185
|
-
|
8186
|
-
UNUSED(backend);
|
8187
|
-
}
|
8188
|
-
|
8189
|
-
static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
8190
|
-
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
8191
|
-
delete cuda_ctx;
|
8192
|
-
delete backend;
|
8193
|
-
}
|
8804
|
+
// cuda buffer
|
8194
8805
|
|
8195
8806
|
struct ggml_backend_buffer_context_cuda {
|
8196
|
-
|
8197
|
-
|
8807
|
+
int device;
|
8808
|
+
void * dev_ptr = nullptr;
|
8198
8809
|
ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
|
8199
8810
|
size_t temp_tensor_extra_index = 0;
|
8200
8811
|
|
8812
|
+
ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
|
8813
|
+
|
8201
8814
|
~ggml_backend_buffer_context_cuda() {
|
8202
8815
|
delete[] temp_tensor_extras;
|
8203
8816
|
}
|
@@ -8218,41 +8831,20 @@ struct ggml_backend_buffer_context_cuda {
|
|
8218
8831
|
|
8219
8832
|
// free_buffer callback of the CUDA buffer interface: releases the device
// allocation held by the buffer context and then destroys the context itself.
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_backend_buffer_context_cuda * buf_ctx = (ggml_backend_buffer_context_cuda *) buffer->context;

    void * device_mem = buf_ctx->dev_ptr;
    CUDA_CHECK(cudaFree(device_mem));

    delete buf_ctx;
}
|
8224
8837
|
|
8225
8838
|
// get_base callback of the CUDA buffer interface: the base address of the buffer
// is the device pointer stored in its context.
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
    return ((ggml_backend_buffer_context_cuda *) buffer->context)->dev_ptr;
}
|
8250
8842
|
|
8251
8843
|
static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
8252
8844
|
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
8253
8845
|
|
8254
8846
|
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
8255
|
-
assert(tensor->view_src->buffer->
|
8847
|
+
assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
|
8256
8848
|
tensor->backend = tensor->view_src->backend;
|
8257
8849
|
tensor->extra = tensor->view_src->extra;
|
8258
8850
|
return;
|
@@ -8260,7 +8852,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
|
|
8260
8852
|
|
8261
8853
|
ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
|
8262
8854
|
|
8263
|
-
extra->data_device[
|
8855
|
+
extra->data_device[ctx->device] = tensor->data;
|
8264
8856
|
|
8265
8857
|
tensor->backend = GGML_BACKEND_GPU;
|
8266
8858
|
tensor->extra = extra;
|
@@ -8272,64 +8864,208 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
|
|
8272
8864
|
int64_t nrows_split = row_high - row_low;
|
8273
8865
|
|
8274
8866
|
size_t original_size = ggml_nbytes_split(tensor, nrows_split);
|
8275
|
-
size_t padded_size =
|
8867
|
+
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
8276
8868
|
|
8277
8869
|
if (padded_size > original_size && tensor->view_src == nullptr) {
|
8278
|
-
CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[
|
8870
|
+
CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
|
8279
8871
|
}
|
8280
8872
|
}
|
8281
8873
|
|
8282
8874
|
UNUSED(buffer);
|
8283
8875
|
}
|
8284
8876
|
|
8877
|
+
// set_tensor callback of the CUDA buffer interface.
// Copies `size` bytes from host memory `data` into the tensor's device memory,
// starting `offset` bytes into the tensor, using a blocking cudaMemcpy.
// The range must lie inside the tensor, the tensor must be allocated and must
// be a GGML_BACKEND_GPU tensor.
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    UNUSED(buffer);

    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

    char * device_dst = (char *)tensor->data + offset;
    CUDA_CHECK(cudaMemcpy(device_dst, data, size, cudaMemcpyHostToDevice));
}
|
8886
|
+
|
8887
|
+
// get_tensor callback of the CUDA buffer interface.
// Copies `size` bytes of the tensor's device memory, starting `offset` bytes into
// the tensor, to host memory `data`, using a blocking cudaMemcpy.
// The range must lie inside the tensor, the tensor must be allocated and must
// be a GGML_BACKEND_GPU tensor.
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    UNUSED(buffer);

    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

    const char * device_src = (const char *)tensor->data + offset;
    CUDA_CHECK(cudaMemcpy(data, device_src, size, cudaMemcpyDeviceToHost));
}
|
8896
|
+
|
8285
8897
|
static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
|
8286
|
-
/* .free_buffer
|
8287
|
-
/* .get_base
|
8288
|
-
/* .
|
8289
|
-
/* .
|
8290
|
-
/* .
|
8898
|
+
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
|
8899
|
+
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
|
8900
|
+
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
|
8901
|
+
/* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
|
8902
|
+
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
|
8903
|
+
/* .cpy_tensor_from = */ NULL,
|
8904
|
+
/* .cpy_tensor_to = */ NULL,
|
8291
8905
|
};
|
8292
8906
|
|
8293
|
-
|
8294
|
-
ggml_cuda_set_device(g_main_device);
|
8907
|
+
// cuda buffer type
|
8295
8908
|
|
8296
|
-
|
8909
|
+
// alloc_buffer callback of the CUDA buffer type.
// The device index is stored directly in buft->context (as an integer cast to a
// pointer). Allocates `size` bytes (at least 1) with cudaMalloc on that device
// and wraps the allocation in a backend buffer that uses
// cuda_backend_buffer_interface.
static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    int device = (int) (intptr_t) buft->context;

    // a failed device switch must not go unnoticed, otherwise the allocation
    // below would silently land on whatever device is current
    CUDA_CHECK(ggml_cuda_set_device(device));

    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0

    void * dev_ptr;
    CUDA_CHECK(cudaMalloc(&dev_ptr, size));

    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);

    return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
}
|
8305
8923
|
|
8306
|
-
static size_t
|
8924
|
+
// get_alignment callback of the CUDA buffer type: always reports 128,
// independent of the buffer type passed in.
static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    UNUSED(buft);

    const size_t alignment = 128;
    return alignment;
}
|
8929
|
+
|
8930
|
+
// get_alloc_size callback of the CUDA buffer type.
// Starts from ggml_nbytes_split() over all rows of the tensor. For quantized
// types whose first dimension is not a multiple of MATRIX_ROW_PADDING, the
// bytes needed to round one row up to that multiple are added on top.
static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
    UNUSED(buft);

    // all rows of the tensor: [0, ggml_nrows(tensor))
    const int64_t n_rows = ggml_nrows(tensor);
    size_t alloc_size = ggml_nbytes_split(tensor, n_rows);

    const int64_t row_len  = tensor->ne[0];
    const int64_t overhang = row_len % MATRIX_ROW_PADDING;

    if (ggml_is_quantized(tensor->type) && overhang != 0) {
        alloc_size += (MATRIX_ROW_PADDING - overhang)
            * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
    }

    return alloc_size;
}
|
8950
|
+
|
8951
|
+
// supports_backend callback of the CUDA buffer type: a backend is supported
// exactly when ggml_backend_is_cuda() accepts it; the buffer type is not consulted.
static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
    UNUSED(buft);

    const bool is_cuda_backend = ggml_backend_is_cuda(backend);
    return is_cuda_backend;
}
|
8956
|
+
|
8957
|
+
static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
|
8958
|
+
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
8959
|
+
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
8960
|
+
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
8961
|
+
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
8962
|
+
};
|
8963
|
+
|
8964
|
+
// Returns the buffer type object for CUDA device `device`.
//
// One buffer type per possible device (GGML_CUDA_MAX_DEVICES in total) is kept
// in a function-local static table that is filled on the first call; each entry
// stores its device index in the context field. The returned pointer refers to
// that table and stays valid for the lifetime of the program.
//
// `device` must be in [0, GGML_CUDA_MAX_DEVICES); anything else would index
// outside the table, so it is rejected with an assertion.
// NOTE(review): the lazy initialization is guarded by a plain bool — concurrent
// first calls are not protected here; confirm callers do not race on it.
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
    GGML_ASSERT(device >= 0 && device < GGML_CUDA_MAX_DEVICES && "invalid CUDA device index");

    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
    static bool ggml_backend_buffer_type_cuda_initialized = false;
    if (!ggml_backend_buffer_type_cuda_initialized) {
        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
            ggml_backend_buffer_type_cuda[i] = {
                /* .iface    = */ cuda_backend_buffer_type_interface,
                /* .context  = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
            };
        }
        ggml_backend_buffer_type_cuda_initialized = true;
    }

    return &ggml_backend_buffer_type_cuda[device];
}
|
8979
|
+
|
8980
|
+
// host buffer type
|
8981
|
+
|
8982
|
+
// free_buffer override for pinned host buffers.
//
// Host buffers are created with ggml_backend_cpu_buffer_from_ptr() on memory
// obtained from cudaMallocHost(); no ggml_backend_buffer_context_cuda object is
// ever allocated for them. The buffer context therefore must not be cast to that
// type, dereferenced as one, or passed to `delete`.
// NOTE(review): this relies on ggml_backend_cpu_buffer_from_ptr() storing the raw
// pointer it was given as buffer->context — confirm against ggml-backend.c.
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    CUDA_CHECK(cudaFreeHost(buffer->context));
}
|
8987
|
+
|
8988
|
+
// alloc_buffer callback of the CUDA host buffer type.
// Allocates `size` bytes with cudaMallocHost and exposes them through a CPU
// buffer created from that pointer; the buffer's type and its free_buffer
// callback are then replaced with the CUDA host variants.
static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * pinned = nullptr;
    CUDA_CHECK(cudaMallocHost(&pinned, size));

    // FIXME: this is a hack to avoid having to implement a new buffer type
    ggml_backend_buffer_t host_buffer = ggml_backend_cpu_buffer_from_ptr(pinned, size);
    host_buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
    host_buffer->buft = buft;

    return host_buffer;
}
|
9001
|
+
|
9002
|
+
struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
|
9003
|
+
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
|
9004
|
+
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
9005
|
+
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
9006
|
+
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
9007
|
+
};
|
9008
|
+
|
9009
|
+
// Returns the single CUDA host buffer type object. It is a function-local
// static built from cuda_backend_host_buffer_type_interface with no context.
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
    static struct ggml_backend_buffer_type host_buffer_type = {
        /* .iface    = */ cuda_backend_host_buffer_type_interface,
        /* .context  = */ nullptr,
    };

    return &host_buffer_type;
}
|
9017
|
+
|
9018
|
+
// backend
|
9019
|
+
|
9020
|
+
// Per-backend state of a CUDA backend instance.
struct ggml_backend_context_cuda {
    // index of the CUDA device this backend instance operates on
    int device;
};
|
9023
|
+
|
9024
|
+
// Name callback of the CUDA backend: always GGML_CUDA_NAME, regardless of the
// backend instance passed in.
static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
    UNUSED(backend);

    return GGML_CUDA_NAME;
}
|
8310
9029
|
|
9030
|
+
// Free callback of the CUDA backend: destroys the backend's CUDA context object
// and then the backend object itself.
static void ggml_backend_cuda_free(ggml_backend_t backend) {
    ggml_backend_context_cuda * backend_ctx = (ggml_backend_context_cuda *)backend->context;
    delete backend_ctx;

    delete backend;
}
|
9036
|
+
|
9037
|
+
// Default buffer type of a CUDA backend: the CUDA buffer type that belongs to
// the device recorded in the backend's context.
static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
    const int device = ((ggml_backend_context_cuda *)backend->context)->device;

    return ggml_backend_cuda_buffer_type(device);
}
|
9042
|
+
|
8311
9043
|
// Asynchronous host -> device upload into a tensor.
// Enqueues a cudaMemcpyAsync of `size` bytes from `data` to `offset` bytes into
// the tensor on stream 0 of the backend's device and returns without waiting.
// The tensor must live in a buffer of this backend's device buffer type, be
// allocated, be a GGML_BACKEND_GPU tensor, and the range must lie inside it.
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;

    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

    cudaStream_t stream = g_cudaStreams[cuda_ctx->device][0];
    char * device_dst   = (char *)tensor->data + offset;

    CUDA_CHECK(cudaMemcpyAsync(device_dst, data, size, cudaMemcpyHostToDevice, stream));
}
|
8320
9053
|
|
8321
9054
|
// Asynchronous device -> host download from a tensor.
// Enqueues a cudaMemcpyAsync of `size` bytes, starting `offset` bytes into the
// tensor, to `data` on stream 0 of the backend's device and returns without
// waiting.
// The tensor must live in a buffer of this backend's device buffer type, be
// allocated, be a GGML_BACKEND_GPU tensor, and the range must lie inside it.
static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;

    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

    cudaStream_t stream     = g_cudaStreams[cuda_ctx->device][0];
    const char * device_src = (const char *)tensor->data + offset;

    CUDA_CHECK(cudaMemcpyAsync(data, device_src, size, cudaMemcpyDeviceToHost, stream));
}
|
8330
9064
|
|
8331
9065
|
// Block the calling host thread until all work queued on g_cudaStreams[device][0] of the
// backend's device has finished. This is the same stream the async tensor set/get
// functions of this backend enqueue their copies on.
static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;

    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
}
|
@@ -8343,14 +9079,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
|
|
8343
9079
|
UNUSED(cgraph);
|
8344
9080
|
}
|
8345
9081
|
|
8346
|
-
|
9082
|
+
// Graph plans are not implemented for the CUDA backend: the assertion below always
// fails, so this function must not be called. Both parameters are ignored.
static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    UNUSED(plan);
    UNUSED(backend);

    GGML_ASSERT(!"not implemented");
}
|
8352
9088
|
|
8353
|
-
|
9089
|
+
static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
8354
9090
|
GGML_ASSERT(!"not implemented");
|
8355
9091
|
|
8356
9092
|
UNUSED(backend);
|
@@ -8358,7 +9094,9 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
|
|
8358
9094
|
}
|
8359
9095
|
|
8360
9096
|
static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
8361
|
-
|
9097
|
+
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
9098
|
+
|
9099
|
+
ggml_cuda_set_main_device(cuda_ctx->device);
|
8362
9100
|
|
8363
9101
|
ggml_compute_params params = {};
|
8364
9102
|
params.type = GGML_TASK_COMPUTE;
|
@@ -8366,13 +9104,18 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
|
|
8366
9104
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
8367
9105
|
ggml_tensor * node = cgraph->nodes[i];
|
8368
9106
|
|
8369
|
-
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
|
9107
|
+
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
|
8370
9108
|
continue;
|
8371
|
-
|
9109
|
+
|
8372
9110
|
assert(node->backend == GGML_BACKEND_GPU);
|
9111
|
+
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
9112
|
+
assert(node->extra != nullptr);
|
9113
|
+
|
8373
9114
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
8374
9115
|
if (node->src[j] != nullptr) {
|
8375
9116
|
assert(node->src[j]->backend == GGML_BACKEND_GPU);
|
9117
|
+
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
9118
|
+
assert(node->src[j]->extra != nullptr);
|
8376
9119
|
}
|
8377
9120
|
}
|
8378
9121
|
|
@@ -8409,27 +9152,98 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
|
|
8409
9152
|
UNUSED(backend);
|
8410
9153
|
}
|
8411
9154
|
|
9155
|
+
// Tell the caller whether the tensor operation `op` is accepted by the CUDA backend.
//
//   - GGML_OP_UNARY: only the GELU, SILU and RELU unary ops are accepted.
//   - GGML_OP_MUL_MAT / GGML_OP_MUL_MAT_ID: accepted only when the two operands agree in
//     ne[3]. The operands are src[0] and src[1] for MUL_MAT, src[2] and src[1] for MUL_MAT_ID.
//   - the remaining ops listed in the switch are always accepted.
//   - anything else is rejected.
//
// `backend` is not consulted.
static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
    UNUSED(backend);

    if (op->op == GGML_OP_UNARY) {
        const auto unary = ggml_get_unary_op(op);
        return unary == GGML_UNARY_OP_GELU ||
               unary == GGML_UNARY_OP_SILU ||
               unary == GGML_UNARY_OP_RELU;
    }

    if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) {
        // MUL_MAT_ID keeps its first matrix operand in src[2] instead of src[0]
        const ggml_tensor * lhs = (op->op == GGML_OP_MUL_MAT) ? op->src[0] : op->src[2];
        const ggml_tensor * rhs = op->src[1];
        return lhs->ne[3] == rhs->ne[3];
    }

    switch (op->op) {
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
        case GGML_OP_NORM:
        case GGML_OP_REPEAT:
        case GGML_OP_GET_ROWS:
        case GGML_OP_DUP:
        case GGML_OP_ADD:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_RMS_NORM:
        case GGML_OP_SCALE:
        case GGML_OP_SQR:
        case GGML_OP_CLAMP:
        case GGML_OP_CPY:
        case GGML_OP_CONT:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_ROPE:
        case GGML_OP_ALIBI:
        case GGML_OP_IM2COL:
        case GGML_OP_SUM_ROWS:
        case GGML_OP_ARGSORT:
            return true;
        default:
            return false;
    }
}
|
9216
|
+
|
8412
9217
|
// Function table of the CUDA backend. The initializer is positional; the comment in
// front of each entry names the ggml_backend_i slot it fills. The two cpy_tensor_*_async
// slots are left empty.
static ggml_backend_i cuda_backend_i = {
    /* .get_name                = */ ggml_backend_cuda_name,
    /* .free                    = */ ggml_backend_cuda_free,
    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
    /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
    /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
    /* .cpy_tensor_from_async   = */ nullptr,
    /* .cpy_tensor_to_async     = */ nullptr,
    /* .synchronize             = */ ggml_backend_cuda_synchronize,
    /* .graph_plan_create       = */ ggml_backend_cuda_graph_plan_create,
    /* .graph_plan_free         = */ ggml_backend_cuda_graph_plan_free,
    /* .graph_plan_compute      = */ ggml_backend_cuda_graph_plan_compute,
    /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
    /* .supports_op             = */ ggml_backend_cuda_supports_op,
};
|
8428
9232
|
|
8429
|
-
ggml_backend_t ggml_backend_cuda_init() {
|
9233
|
+
ggml_backend_t ggml_backend_cuda_init(int device) {
|
8430
9234
|
ggml_init_cublas(); // TODO: remove from ggml.c
|
8431
9235
|
|
8432
|
-
|
9236
|
+
if (device < 0 || device >= ggml_cuda_get_device_count()) {
|
9237
|
+
fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
|
9238
|
+
return nullptr;
|
9239
|
+
}
|
9240
|
+
|
9241
|
+
// not strictly necessary, but it may reduce the overhead of the first graph_compute
|
9242
|
+
ggml_cuda_set_main_device(device);
|
9243
|
+
|
9244
|
+
ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
|
9245
|
+
/* .device = */ device
|
9246
|
+
};
|
8433
9247
|
|
8434
9248
|
ggml_backend_t cuda_backend = new ggml_backend {
|
8435
9249
|
/* .interface = */ cuda_backend_i,
|
@@ -8438,3 +9252,25 @@ ggml_backend_t ggml_backend_cuda_init() {
|
|
8438
9252
|
|
8439
9253
|
return cuda_backend;
|
8440
9254
|
}
|
9255
|
+
|
9256
|
+
// Returns true when `backend` is a CUDA backend. The test compares the backend's
// get_name function pointer against the CUDA implementation of that slot.
bool ggml_backend_is_cuda(ggml_backend_t backend) {
    const bool uses_cuda_name_fn = (backend->iface.get_name == ggml_backend_cuda_name);
    return uses_cuda_name_fn;
}
|
9259
|
+
|
9260
|
+
// Init callback handed to ggml_backend_register: creates the CUDA backend for the device
// index that was packed into `user_data` as an integer. `params` is ignored.
// Returns whatever ggml_backend_cuda_init returns for that device.
static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
    UNUSED(params);

    // user_data carries the device index itself, not a pointer to it
    const int device = (int) (intptr_t) user_data;
    return ggml_backend_cuda_init(device);
}
|
9266
|
+
|
9267
|
+
extern "C" int ggml_backend_cuda_reg_devices() {
|
9268
|
+
int device_count = ggml_cuda_get_device_count();
|
9269
|
+
//int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
|
9270
|
+
for (int i = 0; i < device_count; i++) {
|
9271
|
+
char name[128];
|
9272
|
+
snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
|
9273
|
+
ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
|
9274
|
+
}
|
9275
|
+
return device_count;
|
9276
|
+
}
|