llama_cpp 0.9.4 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +121 -15
- data/ext/llama_cpp/src/ggml-alloc.c +43 -8
- data/ext/llama_cpp/src/ggml-alloc.h +7 -0
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1270 -434
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +535 -175
- data/ext/llama_cpp/src/ggml-metal.metal +888 -237
- data/ext/llama_cpp/src/ggml-opencl.cpp +5 -7
- data/ext/llama_cpp/src/ggml.c +393 -127
- data/ext/llama_cpp/src/ggml.h +59 -7
- data/ext/llama_cpp/src/llama.cpp +791 -357
- data/ext/llama_cpp/src/llama.h +29 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +20 -2
- metadata +3 -3
@@ -1,7 +1,8 @@
|
|
1
1
|
#include <algorithm>
|
2
|
-
#include <cinttypes>
|
3
2
|
#include <cstddef>
|
4
3
|
#include <cstdint>
|
4
|
+
#include <cinttypes>
|
5
|
+
#include <float.h>
|
5
6
|
#include <limits>
|
6
7
|
#include <stdint.h>
|
7
8
|
#include <stdio.h>
|
@@ -69,6 +70,7 @@
|
|
69
70
|
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
70
71
|
#define cudaSetDevice hipSetDevice
|
71
72
|
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
73
|
+
#define cudaStreamFireAndForget hipStreamFireAndForget
|
72
74
|
#define cudaStreamNonBlocking hipStreamNonBlocking
|
73
75
|
#define cudaStreamSynchronize hipStreamSynchronize
|
74
76
|
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
|
@@ -190,7 +192,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
|
190
192
|
fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
|
191
193
|
cudaGetErrorString(err_)); \
|
192
194
|
fprintf(stderr, "current device: %d\n", id); \
|
193
|
-
|
195
|
+
GGML_ASSERT(!"CUDA error"); \
|
194
196
|
} \
|
195
197
|
} while (0)
|
196
198
|
|
@@ -204,7 +206,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
|
204
206
|
fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
|
205
207
|
err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
|
206
208
|
fprintf(stderr, "current device: %d\n", id); \
|
207
|
-
|
209
|
+
GGML_ASSERT(!"cuBLAS error"); \
|
208
210
|
} \
|
209
211
|
} while (0)
|
210
212
|
#else
|
@@ -216,7 +218,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
|
216
218
|
cudaGetDevice(&id); \
|
217
219
|
fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
|
218
220
|
fprintf(stderr, "current device: %d\n", id); \
|
219
|
-
|
221
|
+
GGML_ASSERT(!"cuBLAS error"); \
|
220
222
|
} \
|
221
223
|
} while (0)
|
222
224
|
#endif // CUDART_VERSION >= 11
|
@@ -433,8 +435,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
|
|
433
435
|
#define WARP_SIZE 32
|
434
436
|
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
435
437
|
|
436
|
-
#define CUDA_ADD_BLOCK_SIZE 256
|
437
|
-
#define CUDA_MUL_BLOCK_SIZE 256
|
438
438
|
#define CUDA_GELU_BLOCK_SIZE 256
|
439
439
|
#define CUDA_SILU_BLOCK_SIZE 256
|
440
440
|
#define CUDA_RELU_BLOCK_SIZE 256
|
@@ -443,6 +443,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
|
|
443
443
|
#define CUDA_SCALE_BLOCK_SIZE 256
|
444
444
|
#define CUDA_CLAMP_BLOCK_SIZE 256
|
445
445
|
#define CUDA_ROPE_BLOCK_SIZE 256
|
446
|
+
#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
|
446
447
|
#define CUDA_ALIBI_BLOCK_SIZE 32
|
447
448
|
#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
|
448
449
|
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
@@ -501,40 +502,112 @@ static size_t g_scratch_offset = 0;
|
|
501
502
|
|
502
503
|
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
503
504
|
|
504
|
-
static
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
return;
|
505
|
+
static __device__ __forceinline__ float warp_reduce_sum(float x) {
|
506
|
+
#pragma unroll
|
507
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
508
|
+
x += __shfl_xor_sync(0xffffffff, x, mask, 32);
|
509
509
|
}
|
510
|
-
|
510
|
+
return x;
|
511
511
|
}
|
512
512
|
|
513
|
-
static
|
514
|
-
|
513
|
+
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
|
514
|
+
#pragma unroll
|
515
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
516
|
+
a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
|
517
|
+
a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
|
518
|
+
}
|
519
|
+
return a;
|
520
|
+
}
|
515
521
|
|
516
|
-
|
517
|
-
|
522
|
+
static __device__ __forceinline__ float warp_reduce_max(float x) {
|
523
|
+
#pragma unroll
|
524
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
525
|
+
x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
|
518
526
|
}
|
519
|
-
|
527
|
+
return x;
|
520
528
|
}
|
521
529
|
|
522
|
-
static
|
523
|
-
|
530
|
+
static __device__ __forceinline__ float op_repeat(const float a, const float b) {
|
531
|
+
return b;
|
532
|
+
}
|
524
533
|
|
525
|
-
|
534
|
+
static __device__ __forceinline__ float op_add(const float a, const float b) {
|
535
|
+
return a + b;
|
536
|
+
}
|
537
|
+
|
538
|
+
static __device__ __forceinline__ float op_mul(const float a, const float b) {
|
539
|
+
return a * b;
|
540
|
+
}
|
541
|
+
|
542
|
+
static __device__ __forceinline__ float op_div(const float a, const float b) {
|
543
|
+
return a / b;
|
544
|
+
}
|
545
|
+
|
546
|
+
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
547
|
+
static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
548
|
+
int ne0, int ne1, int ne2, int ne3,
|
549
|
+
int ne10, int ne11, int ne12, int ne13,
|
550
|
+
/*int s0, */ int s1, int s2, int s3,
|
551
|
+
/*int s10,*/ int s11, int s12, int s13) {
|
552
|
+
const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
|
553
|
+
const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
|
554
|
+
const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
|
555
|
+
const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
|
556
|
+
|
557
|
+
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
|
526
558
|
return;
|
527
559
|
}
|
528
|
-
|
560
|
+
|
561
|
+
const int i11 = i1 % ne11;
|
562
|
+
const int i12 = i2 % ne12;
|
563
|
+
const int i13 = i3 % ne13;
|
564
|
+
|
565
|
+
const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
|
566
|
+
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
567
|
+
const size_t i_dst = i_src0;
|
568
|
+
|
569
|
+
const src0_t * src0_row = src0 + i_src0;
|
570
|
+
const src1_t * src1_row = src1 + i_src1;
|
571
|
+
dst_t * dst_row = dst + i_dst;
|
572
|
+
|
573
|
+
for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
|
574
|
+
const int i10 = i0 % ne10;
|
575
|
+
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
576
|
+
}
|
529
577
|
}
|
530
578
|
|
531
|
-
|
579
|
+
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
580
|
+
static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
581
|
+
int ne0, int ne1, int ne2, int ne3,
|
582
|
+
int ne10, int ne11, int ne12, int ne13,
|
583
|
+
/*int s0, */ int s1, int s2, int s3,
|
584
|
+
/*int s10,*/ int s11, int s12, int s13) {
|
585
|
+
|
532
586
|
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
533
587
|
|
534
|
-
|
588
|
+
const int i3 = i/(ne2*ne1*ne0);
|
589
|
+
const int i2 = (i/(ne1*ne0)) % ne2;
|
590
|
+
const int i1 = (i/ne0) % ne1;
|
591
|
+
const int i0 = i % ne0;
|
592
|
+
|
593
|
+
if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
|
535
594
|
return;
|
536
595
|
}
|
537
|
-
|
596
|
+
|
597
|
+
const int i11 = i1 % ne11;
|
598
|
+
const int i12 = i2 % ne12;
|
599
|
+
const int i13 = i3 % ne13;
|
600
|
+
|
601
|
+
const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
|
602
|
+
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
603
|
+
const size_t i_dst = i_src0;
|
604
|
+
|
605
|
+
const src0_t * src0_row = src0 + i_src0;
|
606
|
+
const src1_t * src1_row = src1 + i_src1;
|
607
|
+
dst_t * dst_row = dst + i_dst;
|
608
|
+
|
609
|
+
const int i10 = i0 % ne10;
|
610
|
+
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
538
611
|
}
|
539
612
|
|
540
613
|
static __global__ void gelu_f32(const float * x, float * dst, const int k) {
|
@@ -577,22 +650,11 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
|
|
577
650
|
dst[i] = x[i] * x[i];
|
578
651
|
}
|
579
652
|
|
580
|
-
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
|
581
|
-
#pragma unroll
|
582
|
-
for (int mask = 16; mask > 0; mask >>= 1) {
|
583
|
-
a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
|
584
|
-
a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
|
585
|
-
}
|
586
|
-
return a;
|
587
|
-
}
|
588
|
-
|
589
653
|
template <int block_size>
|
590
|
-
static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
|
654
|
+
static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
|
591
655
|
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
592
656
|
const int tid = threadIdx.x;
|
593
657
|
|
594
|
-
const float eps = 1e-5f;
|
595
|
-
|
596
658
|
float2 mean_var = make_float2(0.f, 0.f);
|
597
659
|
|
598
660
|
for (int col = tid; col < ncols; col += block_size) {
|
@@ -624,14 +686,6 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
|
|
624
686
|
}
|
625
687
|
}
|
626
688
|
|
627
|
-
static __device__ __forceinline__ float warp_reduce_sum(float x) {
|
628
|
-
#pragma unroll
|
629
|
-
for (int mask = 16; mask > 0; mask >>= 1) {
|
630
|
-
x += __shfl_xor_sync(0xffffffff, x, mask, 32);
|
631
|
-
}
|
632
|
-
return x;
|
633
|
-
}
|
634
|
-
|
635
689
|
template <int block_size>
|
636
690
|
static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
|
637
691
|
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
@@ -4550,6 +4604,116 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
|
4550
4604
|
cpy_1(cx + x_offset, cdst + dst_offset);
|
4551
4605
|
}
|
4552
4606
|
|
4607
|
+
static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
|
4608
|
+
const float * xi = (const float *) cxi;
|
4609
|
+
block_q8_0 * dsti = (block_q8_0 *) cdsti;
|
4610
|
+
|
4611
|
+
float amax = 0.0f; // absolute max
|
4612
|
+
|
4613
|
+
for (int j = 0; j < QK8_0; j++) {
|
4614
|
+
const float v = xi[j];
|
4615
|
+
amax = fmaxf(amax, fabsf(v));
|
4616
|
+
}
|
4617
|
+
|
4618
|
+
const float d = amax / ((1 << 7) - 1);
|
4619
|
+
const float id = d ? 1.0f/d : 0.0f;
|
4620
|
+
|
4621
|
+
dsti->d = d;
|
4622
|
+
|
4623
|
+
for (int j = 0; j < QK8_0; ++j) {
|
4624
|
+
const float x0 = xi[j]*id;
|
4625
|
+
|
4626
|
+
dsti->qs[j] = roundf(x0);
|
4627
|
+
}
|
4628
|
+
}
|
4629
|
+
|
4630
|
+
static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
|
4631
|
+
const float * xi = (const float *) cxi;
|
4632
|
+
block_q4_0 * dsti = (block_q4_0 *) cdsti;
|
4633
|
+
|
4634
|
+
float amax = 0.0f;
|
4635
|
+
float vmax = 0.0f;
|
4636
|
+
|
4637
|
+
for (int j = 0; j < QK4_0; ++j) {
|
4638
|
+
const float v = xi[j];
|
4639
|
+
if (amax < fabsf(v)) {
|
4640
|
+
amax = fabsf(v);
|
4641
|
+
vmax = v;
|
4642
|
+
}
|
4643
|
+
}
|
4644
|
+
|
4645
|
+
const float d = vmax / -8;
|
4646
|
+
const float id = d ? 1.0f/d : 0.0f;
|
4647
|
+
|
4648
|
+
dsti->d = d;
|
4649
|
+
|
4650
|
+
for (int j = 0; j < QK4_0/2; ++j) {
|
4651
|
+
const float x0 = xi[0 + j]*id;
|
4652
|
+
const float x1 = xi[QK4_0/2 + j]*id;
|
4653
|
+
|
4654
|
+
const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
|
4655
|
+
const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
|
4656
|
+
|
4657
|
+
dsti->qs[j] = xi0;
|
4658
|
+
dsti->qs[j] |= xi1 << 4;
|
4659
|
+
}
|
4660
|
+
}
|
4661
|
+
|
4662
|
+
static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
|
4663
|
+
const float * xi = (const float *) cxi;
|
4664
|
+
block_q4_1 * dsti = (block_q4_1 *) cdsti;
|
4665
|
+
|
4666
|
+
float vmin = FLT_MAX;
|
4667
|
+
float vmax = -FLT_MAX;
|
4668
|
+
|
4669
|
+
for (int j = 0; j < QK4_1; ++j) {
|
4670
|
+
const float v = xi[j];
|
4671
|
+
|
4672
|
+
if (v < vmin) vmin = v;
|
4673
|
+
if (v > vmax) vmax = v;
|
4674
|
+
}
|
4675
|
+
|
4676
|
+
const float d = (vmax - vmin) / ((1 << 4) - 1);
|
4677
|
+
const float id = d ? 1.0f/d : 0.0f;
|
4678
|
+
|
4679
|
+
dsti->dm.x = d;
|
4680
|
+
dsti->dm.y = vmin;
|
4681
|
+
|
4682
|
+
for (int j = 0; j < QK4_1/2; ++j) {
|
4683
|
+
const float x0 = (xi[0 + j] - vmin)*id;
|
4684
|
+
const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
|
4685
|
+
|
4686
|
+
const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
|
4687
|
+
const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
|
4688
|
+
|
4689
|
+
dsti->qs[j] = xi0;
|
4690
|
+
dsti->qs[j] |= xi1 << 4;
|
4691
|
+
}
|
4692
|
+
}
|
4693
|
+
|
4694
|
+
template <cpy_kernel_t cpy_blck, int qk>
|
4695
|
+
static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
|
4696
|
+
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
4697
|
+
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
|
4698
|
+
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
|
4699
|
+
|
4700
|
+
if (i >= ne) {
|
4701
|
+
return;
|
4702
|
+
}
|
4703
|
+
|
4704
|
+
const int i02 = i / (ne00*ne01);
|
4705
|
+
const int i01 = (i - i02*ne01*ne00) / ne00;
|
4706
|
+
const int i00 = (i - i02*ne01*ne00 - i01*ne00);
|
4707
|
+
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
|
4708
|
+
|
4709
|
+
const int i12 = i / (ne10*ne11);
|
4710
|
+
const int i11 = (i - i12*ne10*ne11) / ne10;
|
4711
|
+
const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
|
4712
|
+
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
|
4713
|
+
|
4714
|
+
cpy_blck(cx + x_offset, cdst + dst_offset);
|
4715
|
+
}
|
4716
|
+
|
4553
4717
|
static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
|
4554
4718
|
const float y = (i0 / 2 - low) / max(0.001f, high - low);
|
4555
4719
|
return 1.0f - min(1.0f, max(0.0f, y));
|
@@ -4610,8 +4774,8 @@ static __global__ void rope(
|
|
4610
4774
|
|
4611
4775
|
template<typename T, bool has_pos>
|
4612
4776
|
static __global__ void rope_neox(
|
4613
|
-
const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows,
|
4614
|
-
float ext_factor, float attn_factor, rope_corr_dims corr_dims
|
4777
|
+
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
|
4778
|
+
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
|
4615
4779
|
) {
|
4616
4780
|
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
4617
4781
|
|
@@ -4620,23 +4784,25 @@ static __global__ void rope_neox(
|
|
4620
4784
|
}
|
4621
4785
|
|
4622
4786
|
const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
4623
|
-
const int
|
4787
|
+
const int ib = col / n_dims;
|
4788
|
+
const int ic = col % n_dims;
|
4789
|
+
|
4790
|
+
const int i = row*ncols + ib*n_dims + ic/2;
|
4624
4791
|
const int i2 = row/p_delta_rows;
|
4625
4792
|
|
4626
|
-
|
4627
|
-
const float cur_rot = -float(col)/ncols;
|
4793
|
+
float cur_rot = inv_ndims * ic - ib;
|
4628
4794
|
|
4629
4795
|
const int p = has_pos ? pos[i2] : 0;
|
4630
|
-
const float theta_base = p*powf(
|
4796
|
+
const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
|
4631
4797
|
|
4632
4798
|
float cos_theta, sin_theta;
|
4633
4799
|
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
4634
4800
|
|
4635
4801
|
const float x0 = x[i + 0];
|
4636
|
-
const float x1 = x[i +
|
4802
|
+
const float x1 = x[i + n_dims/2];
|
4637
4803
|
|
4638
|
-
dst[i + 0]
|
4639
|
-
dst[i +
|
4804
|
+
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
4805
|
+
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
|
4640
4806
|
}
|
4641
4807
|
|
4642
4808
|
static __global__ void rope_glm_f32(
|
@@ -4702,6 +4868,65 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols,
|
|
4702
4868
|
dst[i] = col * m_k + x[i];
|
4703
4869
|
}
|
4704
4870
|
|
4871
|
+
static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
|
4872
|
+
const int row = blockIdx.y;
|
4873
|
+
const int col = threadIdx.x;
|
4874
|
+
|
4875
|
+
float sum = 0.0f;
|
4876
|
+
for (int i = col; i < ncols; i += blockDim.x) {
|
4877
|
+
sum += x[row * ncols + i];
|
4878
|
+
}
|
4879
|
+
|
4880
|
+
sum = warp_reduce_sum(sum);
|
4881
|
+
|
4882
|
+
if (col == 0) {
|
4883
|
+
dst[row] = sum;
|
4884
|
+
}
|
4885
|
+
}
|
4886
|
+
|
4887
|
+
template<typename T>
|
4888
|
+
static inline __device__ void swap(T & a, T & b) {
|
4889
|
+
T tmp = a;
|
4890
|
+
a = b;
|
4891
|
+
b = tmp;
|
4892
|
+
}
|
4893
|
+
|
4894
|
+
template<ggml_sort_order order>
|
4895
|
+
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
|
4896
|
+
// bitonic sort
|
4897
|
+
int col = threadIdx.x;
|
4898
|
+
int row = blockIdx.y;
|
4899
|
+
|
4900
|
+
if (col >= ncols) return;
|
4901
|
+
|
4902
|
+
const float * x_row = x + row * ncols;
|
4903
|
+
int * dst_row = dst + row * ncols;
|
4904
|
+
|
4905
|
+
// initialize indices
|
4906
|
+
if (col < ncols) {
|
4907
|
+
dst_row[col] = col;
|
4908
|
+
}
|
4909
|
+
__syncthreads();
|
4910
|
+
|
4911
|
+
for (int k = 2; k <= ncols; k *= 2) {
|
4912
|
+
for (int j = k / 2; j > 0; j /= 2) {
|
4913
|
+
int ixj = col ^ j;
|
4914
|
+
if (ixj > col) {
|
4915
|
+
if ((col & k) == 0) {
|
4916
|
+
if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
|
4917
|
+
swap(dst_row[col], dst_row[ixj]);
|
4918
|
+
}
|
4919
|
+
} else {
|
4920
|
+
if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
|
4921
|
+
swap(dst_row[col], dst_row[ixj]);
|
4922
|
+
}
|
4923
|
+
}
|
4924
|
+
}
|
4925
|
+
__syncthreads();
|
4926
|
+
}
|
4927
|
+
}
|
4928
|
+
}
|
4929
|
+
|
4705
4930
|
static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
|
4706
4931
|
const int col = blockDim.y*blockIdx.y + threadIdx.y;
|
4707
4932
|
const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
@@ -4711,49 +4936,79 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
|
|
4711
4936
|
}
|
4712
4937
|
|
4713
4938
|
const int i = row*ncols + col;
|
4714
|
-
//
|
4715
|
-
dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
|
4939
|
+
//dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
|
4940
|
+
//dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
|
4941
|
+
dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
|
4716
4942
|
}
|
4717
4943
|
|
4718
|
-
|
4719
|
-
|
4720
|
-
|
4721
|
-
const int
|
4722
|
-
|
4723
|
-
const int
|
4944
|
+
static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
|
4945
|
+
const int tid = threadIdx.x;
|
4946
|
+
const int rowx = blockIdx.x;
|
4947
|
+
const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
|
4948
|
+
|
4949
|
+
const int block_size = blockDim.x;
|
4950
|
+
|
4951
|
+
const int warp_id = threadIdx.x / WARP_SIZE;
|
4952
|
+
const int lane_id = threadIdx.x % WARP_SIZE;
|
4953
|
+
|
4954
|
+
__shared__ float buf[CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE];
|
4724
4955
|
|
4725
4956
|
float max_val = -INFINITY;
|
4726
4957
|
|
4727
4958
|
for (int col = tid; col < ncols; col += block_size) {
|
4728
|
-
const int
|
4729
|
-
|
4959
|
+
const int ix = rowx*ncols + col;
|
4960
|
+
const int iy = rowy*ncols + col;
|
4961
|
+
max_val = max(max_val, x[ix]*scale + (y ? y[iy] : 0.0f));
|
4730
4962
|
}
|
4731
4963
|
|
4732
4964
|
// find the max value in the block
|
4733
|
-
|
4734
|
-
|
4735
|
-
|
4965
|
+
max_val = warp_reduce_max(max_val);
|
4966
|
+
if (block_size > WARP_SIZE) {
|
4967
|
+
if (warp_id == 0) {
|
4968
|
+
buf[lane_id] = -INFINITY;
|
4969
|
+
}
|
4970
|
+
__syncthreads();
|
4971
|
+
|
4972
|
+
if (lane_id == 0) {
|
4973
|
+
buf[warp_id] = max_val;
|
4974
|
+
}
|
4975
|
+
__syncthreads();
|
4976
|
+
|
4977
|
+
max_val = buf[lane_id];
|
4978
|
+
max_val = warp_reduce_max(max_val);
|
4736
4979
|
}
|
4737
4980
|
|
4738
4981
|
float tmp = 0.f;
|
4739
4982
|
|
4740
4983
|
for (int col = tid; col < ncols; col += block_size) {
|
4741
|
-
const int
|
4742
|
-
const
|
4984
|
+
const int ix = rowx*ncols + col;
|
4985
|
+
const int iy = rowy*ncols + col;
|
4986
|
+
const float val = expf((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val);
|
4743
4987
|
tmp += val;
|
4744
|
-
dst[
|
4988
|
+
dst[ix] = val;
|
4745
4989
|
}
|
4746
4990
|
|
4747
|
-
// sum
|
4748
|
-
|
4749
|
-
|
4750
|
-
|
4991
|
+
// find the sum of exps in the block
|
4992
|
+
tmp = warp_reduce_sum(tmp);
|
4993
|
+
if (block_size > WARP_SIZE) {
|
4994
|
+
if (warp_id == 0) {
|
4995
|
+
buf[lane_id] = 0.f;
|
4996
|
+
}
|
4997
|
+
__syncthreads();
|
4998
|
+
|
4999
|
+
if (lane_id == 0) {
|
5000
|
+
buf[warp_id] = tmp;
|
5001
|
+
}
|
5002
|
+
__syncthreads();
|
5003
|
+
|
5004
|
+
tmp = buf[lane_id];
|
5005
|
+
tmp = warp_reduce_sum(tmp);
|
4751
5006
|
}
|
4752
5007
|
|
4753
5008
|
const float inv_tmp = 1.f / tmp;
|
4754
5009
|
|
4755
5010
|
for (int col = tid; col < ncols; col += block_size) {
|
4756
|
-
const int i =
|
5011
|
+
const int i = rowx*ncols + col;
|
4757
5012
|
dst[i] *= inv_tmp;
|
4758
5013
|
}
|
4759
5014
|
}
|
@@ -4805,25 +5060,119 @@ static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const
|
|
4805
5060
|
k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
|
4806
5061
|
}
|
4807
5062
|
|
4808
|
-
|
4809
|
-
|
4810
|
-
|
4811
|
-
|
4812
|
-
|
4813
|
-
|
4814
|
-
|
4815
|
-
|
4816
|
-
|
4817
|
-
|
4818
|
-
|
4819
|
-
|
4820
|
-
|
4821
|
-
|
5063
|
+
template<float (*bin_op)(const float, const float)>
|
5064
|
+
struct bin_bcast_cuda {
|
5065
|
+
template<typename src0_t, typename src1_t, typename dst_t>
|
5066
|
+
void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
|
5067
|
+
const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
|
5068
|
+
cudaStream_t stream) {
|
5069
|
+
|
5070
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
5071
|
+
|
5072
|
+
|
5073
|
+
int nr0 = ne10/ne0;
|
5074
|
+
int nr1 = ne11/ne1;
|
5075
|
+
int nr2 = ne12/ne2;
|
5076
|
+
int nr3 = ne13/ne3;
|
5077
|
+
|
5078
|
+
int nr[4] = { nr0, nr1, nr2, nr3 };
|
5079
|
+
|
5080
|
+
// collapse dimensions until first broadcast dimension
|
5081
|
+
int64_t cne0[] = {ne0, ne1, ne2, ne3};
|
5082
|
+
int64_t cne1[] = {ne10, ne11, ne12, ne13};
|
5083
|
+
size_t cnb0[] = {nb0, nb1, nb2, nb3};
|
5084
|
+
size_t cnb1[] = {nb10, nb11, nb12, nb13};
|
5085
|
+
auto collapse = [](int64_t cne[]) {
|
5086
|
+
cne[0] *= cne[1];
|
5087
|
+
cne[1] = cne[2];
|
5088
|
+
cne[2] = cne[3];
|
5089
|
+
cne[3] = 1;
|
5090
|
+
};
|
5091
|
+
|
5092
|
+
auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
|
5093
|
+
cnb[1] *= cne[1];
|
5094
|
+
cnb[2] *= cne[2];
|
5095
|
+
cnb[3] *= cne[3];
|
5096
|
+
};
|
5097
|
+
|
5098
|
+
for (int i = 0; i < 4; i++) {
|
5099
|
+
if (nr[i] != 1) {
|
5100
|
+
break;
|
5101
|
+
}
|
5102
|
+
if (i > 0) {
|
5103
|
+
collapse_nb(cnb0, cne0);
|
5104
|
+
collapse_nb(cnb1, cne1);
|
5105
|
+
collapse(cne0);
|
5106
|
+
collapse(cne1);
|
5107
|
+
}
|
5108
|
+
}
|
5109
|
+
{
|
5110
|
+
int64_t ne0 = cne0[0];
|
5111
|
+
int64_t ne1 = cne0[1];
|
5112
|
+
int64_t ne2 = cne0[2];
|
5113
|
+
int64_t ne3 = cne0[3];
|
5114
|
+
|
5115
|
+
int64_t ne10 = cne1[0];
|
5116
|
+
int64_t ne11 = cne1[1];
|
5117
|
+
int64_t ne12 = cne1[2];
|
5118
|
+
int64_t ne13 = cne1[3];
|
5119
|
+
|
5120
|
+
//size_t nb0 = cnb0[0];
|
5121
|
+
size_t nb1 = cnb0[1];
|
5122
|
+
size_t nb2 = cnb0[2];
|
5123
|
+
size_t nb3 = cnb0[3];
|
5124
|
+
|
5125
|
+
//size_t nb10 = cnb1[0];
|
5126
|
+
size_t nb11 = cnb1[1];
|
5127
|
+
size_t nb12 = cnb1[2];
|
5128
|
+
size_t nb13 = cnb1[3];
|
5129
|
+
|
5130
|
+
//size_t s0 = nb0 / sizeof(src1_t);
|
5131
|
+
size_t s1 = nb1 / sizeof(src1_t);
|
5132
|
+
size_t s2 = nb2 / sizeof(src1_t);
|
5133
|
+
size_t s3 = nb3 / sizeof(src1_t);
|
5134
|
+
|
5135
|
+
//size_t s10 = nb10 / sizeof(src1_t);
|
5136
|
+
size_t s11 = nb11 / sizeof(src1_t);
|
5137
|
+
size_t s12 = nb12 / sizeof(src1_t);
|
5138
|
+
size_t s13 = nb13 / sizeof(src1_t);
|
5139
|
+
|
5140
|
+
|
5141
|
+
const int block_size = 128;
|
5142
|
+
|
5143
|
+
int64_t hne0 = std::max(ne0/2LL, 1LL);
|
5144
|
+
|
5145
|
+
dim3 block_dims;
|
5146
|
+
block_dims.x = std::min<unsigned int>(hne0, block_size);
|
5147
|
+
block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
|
5148
|
+
block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
|
5149
|
+
|
5150
|
+
dim3 block_nums(
|
5151
|
+
(hne0 + block_dims.x - 1) / block_dims.x,
|
5152
|
+
(ne1 + block_dims.y - 1) / block_dims.y,
|
5153
|
+
(ne2*ne3 + block_dims.z - 1) / block_dims.z
|
5154
|
+
);
|
4822
5155
|
|
4823
|
-
|
4824
|
-
|
4825
|
-
|
4826
|
-
|
5156
|
+
if (block_nums.z > 65535) {
|
5157
|
+
// this is the maximum number of blocks in z direction, fallback to 1D grid kernel
|
5158
|
+
int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
|
5159
|
+
k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
|
5160
|
+
src0_dd, src1_dd, dst_dd,
|
5161
|
+
ne0, ne1, ne2, ne3,
|
5162
|
+
ne10, ne11, ne12, ne13,
|
5163
|
+
/* s0, */ s1, s2, s3,
|
5164
|
+
/* s10, */ s11, s12, s13);
|
5165
|
+
} else {
|
5166
|
+
k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
|
5167
|
+
src0_dd, src1_dd, dst_dd,
|
5168
|
+
ne0, ne1, ne2, ne3,
|
5169
|
+
ne10, ne11, ne12, ne13,
|
5170
|
+
/* s0, */ s1, s2, s3,
|
5171
|
+
/* s10, */ s11, s12, s13);
|
5172
|
+
}
|
5173
|
+
}
|
5174
|
+
}
|
5175
|
+
};
|
4827
5176
|
|
4828
5177
|
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
4829
5178
|
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
|
@@ -4845,14 +5194,14 @@ static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
|
|
4845
5194
|
sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
4846
5195
|
}
|
4847
5196
|
|
4848
|
-
static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
5197
|
+
static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
|
4849
5198
|
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
4850
5199
|
if (ncols < 1024) {
|
4851
5200
|
const dim3 block_dims(WARP_SIZE, 1, 1);
|
4852
|
-
norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
|
5201
|
+
norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
|
4853
5202
|
} else {
|
4854
5203
|
const dim3 block_dims(1024, 1, 1);
|
4855
|
-
norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
|
5204
|
+
norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
|
4856
5205
|
}
|
4857
5206
|
}
|
4858
5207
|
|
@@ -4874,34 +5223,10 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
|
|
4874
5223
|
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
|
4875
5224
|
}
|
4876
5225
|
|
4877
|
-
template<typename dst_t>
|
4878
|
-
static void
|
4879
|
-
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
4880
|
-
dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
4881
|
-
}
|
4882
|
-
|
4883
|
-
template<typename dst_t>
|
4884
|
-
static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
4885
|
-
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
4886
|
-
dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
4887
|
-
}
|
4888
|
-
|
4889
|
-
template<typename dst_t>
|
4890
|
-
static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
4891
|
-
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
4892
|
-
dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
4893
|
-
}
|
4894
|
-
|
4895
|
-
template<typename dst_t>
|
4896
|
-
static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
4897
|
-
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
4898
|
-
dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
4899
|
-
}
|
4900
|
-
|
4901
|
-
template<typename dst_t>
|
4902
|
-
static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
5226
|
+
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
5227
|
+
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
|
4903
5228
|
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
4904
|
-
dequantize_block<
|
5229
|
+
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
4905
5230
|
}
|
4906
5231
|
|
4907
5232
|
template<typename dst_t>
|
@@ -4950,6 +5275,64 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
|
|
4950
5275
|
#endif
|
4951
5276
|
}
|
4952
5277
|
|
5278
|
+
static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
5279
|
+
switch (type) {
|
5280
|
+
case GGML_TYPE_Q4_0:
|
5281
|
+
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
5282
|
+
case GGML_TYPE_Q4_1:
|
5283
|
+
return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
|
5284
|
+
case GGML_TYPE_Q5_0:
|
5285
|
+
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
5286
|
+
case GGML_TYPE_Q5_1:
|
5287
|
+
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
5288
|
+
case GGML_TYPE_Q8_0:
|
5289
|
+
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
5290
|
+
case GGML_TYPE_Q2_K:
|
5291
|
+
return dequantize_row_q2_K_cuda;
|
5292
|
+
case GGML_TYPE_Q3_K:
|
5293
|
+
return dequantize_row_q3_K_cuda;
|
5294
|
+
case GGML_TYPE_Q4_K:
|
5295
|
+
return dequantize_row_q4_K_cuda;
|
5296
|
+
case GGML_TYPE_Q5_K:
|
5297
|
+
return dequantize_row_q5_K_cuda;
|
5298
|
+
case GGML_TYPE_Q6_K:
|
5299
|
+
return dequantize_row_q6_K_cuda;
|
5300
|
+
case GGML_TYPE_F32:
|
5301
|
+
return dequantize_block_cuda<1, 1, convert_f32>;
|
5302
|
+
default:
|
5303
|
+
return nullptr;
|
5304
|
+
}
|
5305
|
+
}
|
5306
|
+
|
5307
|
+
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
5308
|
+
switch (type) {
|
5309
|
+
case GGML_TYPE_Q4_0:
|
5310
|
+
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
5311
|
+
case GGML_TYPE_Q4_1:
|
5312
|
+
return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
|
5313
|
+
case GGML_TYPE_Q5_0:
|
5314
|
+
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
5315
|
+
case GGML_TYPE_Q5_1:
|
5316
|
+
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
5317
|
+
case GGML_TYPE_Q8_0:
|
5318
|
+
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
5319
|
+
case GGML_TYPE_Q2_K:
|
5320
|
+
return dequantize_row_q2_K_cuda;
|
5321
|
+
case GGML_TYPE_Q3_K:
|
5322
|
+
return dequantize_row_q3_K_cuda;
|
5323
|
+
case GGML_TYPE_Q4_K:
|
5324
|
+
return dequantize_row_q4_K_cuda;
|
5325
|
+
case GGML_TYPE_Q5_K:
|
5326
|
+
return dequantize_row_q5_K_cuda;
|
5327
|
+
case GGML_TYPE_Q6_K:
|
5328
|
+
return dequantize_row_q6_K_cuda;
|
5329
|
+
case GGML_TYPE_F16:
|
5330
|
+
return dequantize_block_cuda<1, 1, convert_f16>;
|
5331
|
+
default:
|
5332
|
+
return nullptr;
|
5333
|
+
}
|
5334
|
+
}
|
5335
|
+
|
4953
5336
|
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
4954
5337
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
4955
5338
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
@@ -5038,13 +5421,22 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
|
|
5038
5421
|
dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
5039
5422
|
}
|
5040
5423
|
|
5041
|
-
static void
|
5042
|
-
GGML_ASSERT(ncols %
|
5424
|
+
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
5425
|
+
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
5043
5426
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
5044
5427
|
const dim3 block_nums(block_num_y, 1, 1);
|
5045
5428
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
5046
|
-
|
5047
|
-
<<<block_nums, block_dims, 0, stream>>>(vx,
|
5429
|
+
dequantize_mul_mat_vec<1, 1, convert_f16>
|
5430
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
5431
|
+
}
|
5432
|
+
|
5433
|
+
static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
5434
|
+
GGML_ASSERT(ncols % QK4_0 == 0);
|
5435
|
+
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
5436
|
+
const dim3 block_nums(block_num_y, 1, 1);
|
5437
|
+
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
5438
|
+
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
|
5439
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
5048
5440
|
}
|
5049
5441
|
|
5050
5442
|
static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
@@ -5128,83 +5520,6 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|
5128
5520
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
5129
5521
|
}
|
5130
5522
|
|
5131
|
-
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
5132
|
-
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
5133
|
-
dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
5134
|
-
}
|
5135
|
-
|
5136
|
-
static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
|
5137
|
-
const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
|
5138
|
-
dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
5139
|
-
}
|
5140
|
-
|
5141
|
-
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
5142
|
-
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
5143
|
-
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
5144
|
-
const dim3 block_nums(block_num_y, 1, 1);
|
5145
|
-
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
5146
|
-
dequantize_mul_mat_vec<1, 1, convert_f16>
|
5147
|
-
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
5148
|
-
}
|
5149
|
-
|
5150
|
-
static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
5151
|
-
switch (type) {
|
5152
|
-
case GGML_TYPE_Q4_0:
|
5153
|
-
return dequantize_row_q4_0_cuda;
|
5154
|
-
case GGML_TYPE_Q4_1:
|
5155
|
-
return dequantize_row_q4_1_cuda;
|
5156
|
-
case GGML_TYPE_Q5_0:
|
5157
|
-
return dequantize_row_q5_0_cuda;
|
5158
|
-
case GGML_TYPE_Q5_1:
|
5159
|
-
return dequantize_row_q5_1_cuda;
|
5160
|
-
case GGML_TYPE_Q8_0:
|
5161
|
-
return dequantize_row_q8_0_cuda;
|
5162
|
-
case GGML_TYPE_Q2_K:
|
5163
|
-
return dequantize_row_q2_K_cuda;
|
5164
|
-
case GGML_TYPE_Q3_K:
|
5165
|
-
return dequantize_row_q3_K_cuda;
|
5166
|
-
case GGML_TYPE_Q4_K:
|
5167
|
-
return dequantize_row_q4_K_cuda;
|
5168
|
-
case GGML_TYPE_Q5_K:
|
5169
|
-
return dequantize_row_q5_K_cuda;
|
5170
|
-
case GGML_TYPE_Q6_K:
|
5171
|
-
return dequantize_row_q6_K_cuda;
|
5172
|
-
case GGML_TYPE_F32:
|
5173
|
-
return convert_fp32_to_fp16_cuda;
|
5174
|
-
default:
|
5175
|
-
return nullptr;
|
5176
|
-
}
|
5177
|
-
}
|
5178
|
-
|
5179
|
-
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
5180
|
-
switch (type) {
|
5181
|
-
case GGML_TYPE_Q4_0:
|
5182
|
-
return dequantize_row_q4_0_cuda;
|
5183
|
-
case GGML_TYPE_Q4_1:
|
5184
|
-
return dequantize_row_q4_1_cuda;
|
5185
|
-
case GGML_TYPE_Q5_0:
|
5186
|
-
return dequantize_row_q5_0_cuda;
|
5187
|
-
case GGML_TYPE_Q5_1:
|
5188
|
-
return dequantize_row_q5_1_cuda;
|
5189
|
-
case GGML_TYPE_Q8_0:
|
5190
|
-
return dequantize_row_q8_0_cuda;
|
5191
|
-
case GGML_TYPE_Q2_K:
|
5192
|
-
return dequantize_row_q2_K_cuda;
|
5193
|
-
case GGML_TYPE_Q3_K:
|
5194
|
-
return dequantize_row_q3_K_cuda;
|
5195
|
-
case GGML_TYPE_Q4_K:
|
5196
|
-
return dequantize_row_q4_K_cuda;
|
5197
|
-
case GGML_TYPE_Q5_K:
|
5198
|
-
return dequantize_row_q5_K_cuda;
|
5199
|
-
case GGML_TYPE_Q6_K:
|
5200
|
-
return dequantize_row_q6_K_cuda;
|
5201
|
-
case GGML_TYPE_F16:
|
5202
|
-
return convert_fp16_to_fp32_cuda;
|
5203
|
-
default:
|
5204
|
-
return nullptr;
|
5205
|
-
}
|
5206
|
-
}
|
5207
|
-
|
5208
5523
|
static void ggml_mul_mat_q4_0_q8_1_cuda(
|
5209
5524
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
5210
5525
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
@@ -5697,6 +6012,39 @@ static void ggml_cpy_f32_f16_cuda(
|
|
5697
6012
|
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
5698
6013
|
}
|
5699
6014
|
|
6015
|
+
static void ggml_cpy_f32_q8_0_cuda(
|
6016
|
+
const char * cx, char * cdst, const int ne,
|
6017
|
+
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
6018
|
+
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
6019
|
+
|
6020
|
+
GGML_ASSERT(ne % QK8_0 == 0);
|
6021
|
+
const int num_blocks = ne / QK8_0;
|
6022
|
+
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
|
6023
|
+
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
6024
|
+
}
|
6025
|
+
|
6026
|
+
static void ggml_cpy_f32_q4_0_cuda(
|
6027
|
+
const char * cx, char * cdst, const int ne,
|
6028
|
+
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
6029
|
+
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
6030
|
+
|
6031
|
+
GGML_ASSERT(ne % QK4_0 == 0);
|
6032
|
+
const int num_blocks = ne / QK4_0;
|
6033
|
+
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
|
6034
|
+
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
6035
|
+
}
|
6036
|
+
|
6037
|
+
static void ggml_cpy_f32_q4_1_cuda(
|
6038
|
+
const char * cx, char * cdst, const int ne,
|
6039
|
+
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
6040
|
+
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
6041
|
+
|
6042
|
+
GGML_ASSERT(ne % QK4_1 == 0);
|
6043
|
+
const int num_blocks = ne / QK4_1;
|
6044
|
+
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
|
6045
|
+
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
6046
|
+
}
|
6047
|
+
|
5700
6048
|
static void ggml_cpy_f16_f16_cuda(
|
5701
6049
|
const char * cx, char * cdst, const int ne,
|
5702
6050
|
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
@@ -5739,20 +6087,26 @@ static void rope_cuda(
|
|
5739
6087
|
|
5740
6088
|
template<typename T>
|
5741
6089
|
static void rope_neox_cuda(
|
5742
|
-
const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
|
6090
|
+
const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
|
5743
6091
|
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
|
5744
6092
|
) {
|
5745
6093
|
GGML_ASSERT(ncols % 2 == 0);
|
5746
6094
|
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
5747
6095
|
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
5748
6096
|
const dim3 block_nums(nrows, num_blocks_x, 1);
|
6097
|
+
|
6098
|
+
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
6099
|
+
const float inv_ndims = -1.0f / n_dims;
|
6100
|
+
|
5749
6101
|
if (pos == nullptr) {
|
5750
6102
|
rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
|
5751
|
-
x, dst, ncols, pos, freq_scale, p_delta_rows,
|
6103
|
+
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
|
6104
|
+
theta_scale, inv_ndims
|
5752
6105
|
);
|
5753
6106
|
} else {
|
5754
6107
|
rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
|
5755
|
-
x, dst, ncols, pos, freq_scale, p_delta_rows,
|
6108
|
+
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
|
6109
|
+
theta_scale, inv_ndims
|
5756
6110
|
);
|
5757
6111
|
}
|
5758
6112
|
}
|
@@ -5777,6 +6131,27 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const
|
|
5777
6131
|
alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
|
5778
6132
|
}
|
5779
6133
|
|
6134
|
+
static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
6135
|
+
const dim3 block_dims(WARP_SIZE, 1, 1);
|
6136
|
+
const dim3 block_nums(1, nrows, 1);
|
6137
|
+
k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
6138
|
+
}
|
6139
|
+
|
6140
|
+
static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
|
6141
|
+
// bitonic sort requires ncols to be power of 2
|
6142
|
+
GGML_ASSERT((ncols & (ncols - 1)) == 0);
|
6143
|
+
|
6144
|
+
const dim3 block_dims(ncols, 1, 1);
|
6145
|
+
const dim3 block_nums(1, nrows, 1);
|
6146
|
+
if (order == GGML_SORT_ASC) {
|
6147
|
+
k_argsort_f32_i32<GGML_SORT_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
6148
|
+
} else if (order == GGML_SORT_DESC) {
|
6149
|
+
k_argsort_f32_i32<GGML_SORT_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
6150
|
+
} else {
|
6151
|
+
GGML_ASSERT(false);
|
6152
|
+
}
|
6153
|
+
}
|
6154
|
+
|
5780
6155
|
static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
|
5781
6156
|
const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
|
5782
6157
|
const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
|
@@ -5784,10 +6159,12 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
|
|
5784
6159
|
diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
|
5785
6160
|
}
|
5786
6161
|
|
5787
|
-
static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
|
5788
|
-
|
6162
|
+
static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
|
6163
|
+
int nth = WARP_SIZE;
|
6164
|
+
while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
|
6165
|
+
const dim3 block_dims(nth, 1, 1);
|
5789
6166
|
const dim3 block_nums(nrows_x, 1, 1);
|
5790
|
-
soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
|
6167
|
+
soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
5791
6168
|
}
|
5792
6169
|
|
5793
6170
|
static void im2col_f32_f16_cuda(const float * x, half * dst,
|
@@ -5867,7 +6244,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
|
|
5867
6244
|
return ptr;
|
5868
6245
|
}
|
5869
6246
|
#ifdef DEBUG_CUDA_MALLOC
|
5870
|
-
fprintf(stderr, "%s: %d buffers, max_size = %u
|
6247
|
+
fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
|
5871
6248
|
(uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
|
5872
6249
|
#endif
|
5873
6250
|
void * ptr;
|
@@ -6005,7 +6382,7 @@ void * ggml_cuda_host_malloc(size_t size) {
|
|
6005
6382
|
// The allocation error can be bypassed. A null ptr will assigned out of this function.
|
6006
6383
|
// This can fixed the OOM error in WSL.
|
6007
6384
|
cudaGetLastError();
|
6008
|
-
fprintf(stderr, "WARNING: failed to allocate %.2f
|
6385
|
+
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
|
6009
6386
|
size/1024.0/1024.0, cudaGetErrorString(err));
|
6010
6387
|
return nullptr;
|
6011
6388
|
}
|
@@ -6050,75 +6427,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
|
|
6050
6427
|
const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
|
6051
6428
|
if (nb0 == ts && nb1 == ts*ne0/bs) {
|
6052
6429
|
return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
|
6053
|
-
}
|
6054
|
-
if (nb0 == ts) {
|
6430
|
+
} else if (nb0 == ts) {
|
6055
6431
|
return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
|
6056
|
-
}
|
6057
|
-
|
6058
|
-
|
6059
|
-
|
6060
|
-
|
6061
|
-
|
6062
|
-
|
6063
|
-
}
|
6064
|
-
return cudaSuccess;
|
6065
|
-
}
|
6066
|
-
|
6067
|
-
static void ggml_cuda_op_repeat(
|
6068
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6069
|
-
const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
|
6070
|
-
// guaranteed to be an integer due to the check in ggml_can_repeat
|
6071
|
-
const int64_t ne0 = dst->ne[0];
|
6072
|
-
const int64_t ne1 = dst->ne[1];
|
6073
|
-
const int64_t ne2 = dst->ne[2];
|
6074
|
-
const int64_t ne3 = dst->ne[3];
|
6075
|
-
|
6076
|
-
const int64_t ne00 = src0->ne[0];
|
6077
|
-
const int64_t ne01 = src0->ne[1];
|
6078
|
-
const int64_t ne02 = src0->ne[2];
|
6079
|
-
const int64_t ne03 = src0->ne[3];
|
6080
|
-
|
6081
|
-
const size_t nb0 = dst->nb[0];
|
6082
|
-
const size_t nb1 = dst->nb[1];
|
6083
|
-
const size_t nb2 = dst->nb[2];
|
6084
|
-
const size_t nb3 = dst->nb[3];
|
6085
|
-
|
6086
|
-
const size_t nb00 = src0->nb[0];
|
6087
|
-
const size_t nb01 = src0->nb[1];
|
6088
|
-
const size_t nb02 = src0->nb[2];
|
6089
|
-
const size_t nb03 = src0->nb[3];
|
6090
|
-
|
6091
|
-
const int nr0 = (int)(ne0/ne00);
|
6092
|
-
const int nr1 = (int)(ne1/ne01);
|
6093
|
-
const int nr2 = (int)(ne2/ne02);
|
6094
|
-
const int nr3 = (int)(ne3/ne03);
|
6095
|
-
|
6096
|
-
// TODO: support for transposed / permuted tensors
|
6097
|
-
GGML_ASSERT(nb0 == sizeof(float));
|
6098
|
-
GGML_ASSERT(nb00 == sizeof(float));
|
6099
|
-
|
6100
|
-
// TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
|
6101
|
-
for (int i3 = 0; i3 < nr3; i3++) {
|
6102
|
-
for (int k3 = 0; k3 < ne03; k3++) {
|
6103
|
-
for (int i2 = 0; i2 < nr2; i2++) {
|
6104
|
-
for (int k2 = 0; k2 < ne02; k2++) {
|
6105
|
-
for (int i1 = 0; i1 < nr1; i1++) {
|
6106
|
-
for (int k1 = 0; k1 < ne01; k1++) {
|
6107
|
-
for (int i0 = 0; i0 < nr0; i0++) {
|
6108
|
-
CUDA_CHECK(cudaMemcpyAsync(
|
6109
|
-
(char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
|
6110
|
-
(const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
|
6111
|
-
ne00*nb0, cudaMemcpyDeviceToDevice, stream));
|
6112
|
-
}
|
6113
|
-
}
|
6114
|
-
}
|
6115
|
-
}
|
6116
|
-
}
|
6432
|
+
} else {
|
6433
|
+
for (int64_t i1 = 0; i1 < i1_diff; i1++) {
|
6434
|
+
const void * rx = (const void *) ((const char *) x + i1*nb1);
|
6435
|
+
void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
|
6436
|
+
// pretend the row is a matrix with cols=1
|
6437
|
+
cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
|
6438
|
+
if (r != cudaSuccess) return r;
|
6117
6439
|
}
|
6440
|
+
return cudaSuccess;
|
6118
6441
|
}
|
6119
|
-
|
6120
|
-
(void) src1;
|
6121
|
-
(void) src1_d;
|
6122
6442
|
}
|
6123
6443
|
|
6124
6444
|
static void ggml_cuda_op_get_rows(
|
@@ -6165,44 +6485,55 @@ static void ggml_cuda_op_get_rows(
|
|
6165
6485
|
}
|
6166
6486
|
}
|
6167
6487
|
|
6168
|
-
|
6488
|
+
template<class op>
|
6489
|
+
inline void ggml_cuda_op_bin_bcast(
|
6169
6490
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6170
6491
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6171
6492
|
|
6172
6493
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
6173
6494
|
|
6174
|
-
const int64_t ne10 = src1->ne[0];
|
6175
|
-
const int64_t ne11 = src1->ne[1];
|
6176
|
-
|
6177
6495
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
6178
|
-
|
6496
|
+
op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
6179
6497
|
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
6180
|
-
|
6498
|
+
op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream);
|
6181
6499
|
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
6182
|
-
|
6500
|
+
op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream);
|
6183
6501
|
} else {
|
6184
|
-
fprintf(stderr, "src0
|
6502
|
+
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
|
6503
|
+
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
|
6185
6504
|
GGML_ASSERT(false);
|
6186
6505
|
}
|
6506
|
+
}
|
6507
|
+
|
6508
|
+
static void ggml_cuda_op_repeat(
|
6509
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6510
|
+
const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & main_stream) {
|
6511
|
+
|
6512
|
+
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
|
6187
6513
|
|
6188
6514
|
(void) src1;
|
6189
|
-
(void)
|
6515
|
+
(void) src1_d;
|
6190
6516
|
}
|
6191
6517
|
|
6192
|
-
inline void
|
6518
|
+
inline void ggml_cuda_op_add(
|
6193
6519
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6194
6520
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6195
6521
|
|
6196
|
-
|
6197
|
-
|
6198
|
-
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
6522
|
+
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
6523
|
+
}
|
6199
6524
|
|
6200
|
-
|
6201
|
-
const
|
6525
|
+
inline void ggml_cuda_op_mul(
|
6526
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6527
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6202
6528
|
|
6203
|
-
|
6529
|
+
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
6530
|
+
}
|
6204
6531
|
|
6205
|
-
|
6532
|
+
inline void ggml_cuda_op_div(
|
6533
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6534
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
6535
|
+
|
6536
|
+
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
6206
6537
|
}
|
6207
6538
|
|
6208
6539
|
inline void ggml_cuda_op_gelu(
|
@@ -6271,7 +6602,10 @@ inline void ggml_cuda_op_norm(
|
|
6271
6602
|
const int64_t ne00 = src0->ne[0];
|
6272
6603
|
const int64_t nrows = ggml_nrows(src0);
|
6273
6604
|
|
6274
|
-
|
6605
|
+
float eps;
|
6606
|
+
memcpy(&eps, dst->op_params, sizeof(float));
|
6607
|
+
|
6608
|
+
norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
|
6275
6609
|
|
6276
6610
|
(void) src1;
|
6277
6611
|
(void) dst;
|
@@ -6426,6 +6760,8 @@ inline void ggml_cuda_op_mul_mat_vec_q(
|
|
6426
6760
|
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
6427
6761
|
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
6428
6762
|
|
6763
|
+
GGML_ASSERT(ggml_nrows(src1) == 1);
|
6764
|
+
|
6429
6765
|
const int64_t ne00 = src0->ne[0];
|
6430
6766
|
const int64_t row_diff = row_high - row_low;
|
6431
6767
|
|
@@ -6485,7 +6821,8 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
|
6485
6821
|
size_t ash;
|
6486
6822
|
dfloat * src1_dfloat = nullptr; // dfloat == half
|
6487
6823
|
|
6488
|
-
bool src1_convert_f16 =
|
6824
|
+
bool src1_convert_f16 =
|
6825
|
+
src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
6489
6826
|
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
|
6490
6827
|
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
6491
6828
|
|
@@ -6707,15 +7044,14 @@ inline void ggml_cuda_op_rope(
|
|
6707
7044
|
GGML_ASSERT(false);
|
6708
7045
|
rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
|
6709
7046
|
} else if (is_neox) {
|
6710
|
-
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
|
6711
7047
|
if (src0->type == GGML_TYPE_F32) {
|
6712
7048
|
rope_neox_cuda(
|
6713
|
-
(const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
7049
|
+
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
6714
7050
|
attn_factor, corr_dims, main_stream
|
6715
7051
|
);
|
6716
7052
|
} else if (src0->type == GGML_TYPE_F16) {
|
6717
7053
|
rope_neox_cuda(
|
6718
|
-
(const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
7054
|
+
(const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
6719
7055
|
attn_factor, corr_dims, main_stream
|
6720
7056
|
);
|
6721
7057
|
} else {
|
@@ -6812,6 +7148,42 @@ inline void ggml_cuda_op_im2col(
     (void) src0_dd;
 }
 
+inline void ggml_cuda_op_sum_rows(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_argsort(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+    argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6839,14 +7211,18 @@ inline void ggml_cuda_op_soft_max(
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
+    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+
     const int64_t ne00 = src0->ne[0];
-    const int64_t
+    const int64_t nrows_x = ggml_nrows(src0);
+    const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
 
-
+    float scale = 1.0f;
+    memcpy(&scale, dst->op_params, sizeof(float));
+
+    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
 
-    (void) src1;
     (void) dst;
-    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_scale(
@@ -7016,7 +7392,7 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-
+    const int64_t nrows0 = ggml_nrows(src0);
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
@@ -7052,10 +7428,9 @@ static void ggml_cuda_op_mul_mat(
 
     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
-
     const bool src1_is_contiguous = ggml_is_contiguous(src1);
-
-
+
+    const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
 
     const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
     GGML_ASSERT(!(split && ne02 > 1));
@@ -7180,7 +7555,7 @@ static void ggml_cuda_op_mul_mat(
             const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
 
             // for split tensors the data begins at i0 == i0_offset_low
-            char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
+            char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
             float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
             char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
             float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
@@ -7325,6 +7700,10 @@ static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, gg
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
 }
 
+static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
+}
+
 static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
 }
@@ -7350,7 +7729,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded)
+    if (!g_cublas_loaded) return false;
 
     const int64_t ne10 = src1->ne[0];
 
@@ -7428,7 +7807,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
-__global__
+static __global__ void k_compute_batched_ptrs(
         const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
         const void ** ptrs_src, void ** ptrs_dst,
         int ne12, int ne13,
@@ -7484,9 +7863,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
-
-    CUDA_CHECK(cudaGetDevice(&id));
-    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
+    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -7543,7 +7920,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         // use cublasGemmStridedBatchedEx
         CUBLAS_CHECK(
-        cublasGemmStridedBatchedEx(g_cublas_handles[
+        cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
                 &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
                             (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
@@ -7577,7 +7954,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUDA_CHECK(cudaGetLastError());
 
         CUBLAS_CHECK(
-        cublasGemmBatchedEx(g_cublas_handles[
+        cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
                 &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
                             (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
@@ -7647,10 +8024,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 #ifdef GGML_CUDA_FORCE_DMMV
         const bool use_mul_mat_vec_q = false;
 #else
-        const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+        const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
 #endif // GGML_CUDA_FORCE_DMMV
 
         if (use_mul_mat_vec_q) {
+            // NOTE: this kernel does not support ggml_nrows(src1) > 1
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
         } else {
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
@@ -7675,42 +8053,255 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     }
 }
 
-
-
-
+#if 0
+template<typename ... Srcs>
+static __global__ void k_compute_batched_ptrs_id(
+        const void ** ptrs_src, void ** ptrs_dst,
+        int ne12, int ne13,
+        int ne23,
+        int nb02, int nb03,
+        int nb12, int nb13,
+        int nb2, int nb3,
+        int r2, int r3,
+        ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
+        const half * src1_f16, half * dst_f16,
+        const int32_t * ids, const int id,
+        Srcs... src0s) {
+
+    int i = ids[id];
+
+    half * src0_f16;
+    const void * srcs_ar[] = { (const half *) src0s... };
+    if (src0_type == GGML_TYPE_F16) {
+        src0_f16 = (half *) srcs_ar[i];
+    } else {
+        src0_f16 = src0_as_f16;
+        if (threadIdx.x == 0 && threadIdx.y == 0) {
+            const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
+            to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
+        }
+    }
 
-
-
+    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i13 >= ne13 || i12 >= ne12) {
+        return;
+    }
+
+    int i03 = i13 / r3;
+    int i02 = i12 / r2;
+
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02 + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
 }
 
-static void
-    const
-
+static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
+    const struct ggml_tensor * ids = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * src00 = dst->src[2];
 
-
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    const int id = dst->op_params[0];
 
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(!ggml_is_transposed(src00));
+    GGML_ASSERT(!ggml_is_transposed(src1));
 
-
-
-    GGML_ASSERT(src0->ne[3] == 1);
+    GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
-    const int64_t
-    const int64_t
-    const int64_t
+    const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
+    const int64_t ne01 = src00->ne[1];
+    const int64_t ne02 = src00->ne[2];
+    const int64_t ne03 = src00->ne[3];
+
+    //const int64_t nb01 = src00->nb[1];
+    const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
+    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
-
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
-    const int64_t
-    const int64_t
-    const int64_t
+    //const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
+    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
 
-
-
+    const int64_t ne1 = ggml_nelements(src1);
+    const int64_t ne = ggml_nelements(dst);
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
+
+    //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    //void * src0_ddq = src0_extra->data_device[g_main_device];
+    //half * src0_as_f16 = (half *) src0_ddq;
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    // convert src1 to fp16
+    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+    GGML_ASSERT(to_fp16_cuda != nullptr);
+
+    size_t src1_as = 0;
+    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
+    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
+
+    size_t dst_as = 0;
+    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    const half alpha_f16 = 1.0f;
+    const half beta_f16 = 0.0f;
+
+    // use cublasGemmBatchedEx
+    const int ne23 = ne12*ne13;
+
+    const void ** ptrs_src = nullptr;
+    void ** ptrs_dst = nullptr;
+
+    size_t ptrs_src_s = 0;
+    size_t ptrs_dst_s = 0;
+
+    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+    ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
+
+    int64_t src0_ne = ggml_nelements(src00);
+    half * src0_as_f16 = nullptr;
+    size_t src0_as = 0;
+    if (src00->type != GGML_TYPE_F16) {
+        src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
+    }
+
+    static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
+    dim3 block_dims(ne13, ne12);
+    k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
+            ptrs_src, ptrs_dst,
+            ne12, ne13,
+            ne23,
+            ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
+            nb12, nb13,
+            dst->nb[2], dst->nb[3],
+            r2, r3,
+            src00->type, src0_as_f16, src0_ne,
+            src1_as_f16, dst_f16,
+            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
+            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
+    );
+    CUDA_CHECK(cudaGetLastError());
+
+    CUBLAS_CHECK(
+    cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
+            ne01, ne11, ne10,
+            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
+                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
+            &beta_f16,  ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
+            ne23,
+            CUBLAS_COMPUTE_16F,
+            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    if (src0_as != 0) {
+        ggml_cuda_pool_free(src0_as_f16, src0_as);
+    }
+    if (ptrs_src_s != 0) {
+        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+    }
+    if (ptrs_dst_s != 0) {
+        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+    }
+
+    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+
+    ggml_cuda_pool_free(src1_as_f16, src1_as);
+    ggml_cuda_pool_free(dst_f16, dst_as);
+}
+#endif
+
+static void ggml_cuda_mul_mat_id(const ggml_tensor * _src0, const ggml_tensor * _src1, ggml_tensor * dst) {
+#if 0
+//#ifdef CUDA_USE_TENSOR_CORES
+//    const bool use_tensor_cores = true;
+//#else
+//    const bool use_tensor_cores = false;
+//#endif
+
+    ggml_cuda_mul_mat_id_cublas(dst);
+
+    // TODO: mmq/mmv support
+#else
+    const struct ggml_tensor * ids = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+    const int id = dst->op_params[0];
+
+    int32_t * ids_dev = (int32_t *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
+
+    int32_t a_id;
+    CUDA_CHECK(cudaMemcpyAsync(&a_id, ids_dev + id, sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+    GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
+    const struct ggml_tensor * src0 = dst->src[a_id + 2];
+
+    ggml_cuda_mul_mat(src0, src1, dst);
+#endif
+
+    (void) _src0;
+    (void) _src1;
+}
+
+static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
+}
+
+static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
+}
+
+static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    const int64_t ne = ggml_nelements(src0);
+    GGML_ASSERT(ne == ggml_nelements(src1));
+
+    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+
+    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
+    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    GGML_ASSERT(src0->ne[3] == 1);
+
+    const int64_t nb00 = src0->nb[0];
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    GGML_ASSERT(src1->ne[3] == 1);
+
+    const int64_t nb10 = src1->nb[0];
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2];
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -7719,14 +8310,17 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
 
     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-
+        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
+        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
+        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
+        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -7737,6 +8331,7 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
 }
 
 static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    // TODO: why do we pass dst as src1 here?
     ggml_cuda_cpy(src0, dst, nullptr);
     (void) src1;
 }
@@ -7762,6 +8357,16 @@ static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
 }
 
+static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
+}
+
+static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
+}
+
 static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
@@ -8017,8 +8622,9 @@ void ggml_cuda_set_main_device(const int main_device) {
                 main_device, g_device_count, g_main_device);
         return;
     }
-
-    if (g_device_count > 1) {
+
+    if (g_main_device != main_device && g_device_count > 1) {
+        g_main_device = main_device;
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
         fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
@@ -8044,7 +8650,7 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    if (!g_cublas_loaded)
+    if (!g_cublas_loaded) return false;
 
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8080,6 +8686,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_MUL:
             func = ggml_cuda_mul;
             break;
+        case GGML_OP_DIV:
+            func = ggml_cuda_div;
+            break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(tensor)) {
                 case GGML_UNARY_OP_GELU:
@@ -8093,7 +8702,8 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
                     break;
                 default:
                     return false;
-            }
+            }
+            break;
         case GGML_OP_NORM:
             func = ggml_cuda_norm;
             break;
@@ -8106,6 +8716,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_mul_mat;
             break;
+        case GGML_OP_MUL_MAT_ID:
+            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
+                return false;
+            }
+            func = ggml_cuda_mul_mat_id;
+            break;
         case GGML_OP_SCALE:
             func = ggml_cuda_scale;
             break;
@@ -8145,6 +8761,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_IM2COL:
            func = ggml_cuda_im2col;
            break;
+        case GGML_OP_SUM_ROWS:
+            func = ggml_cuda_sum_rows;
+            break;
+        case GGML_OP_ARGSORT:
+            func = ggml_cuda_argsort;
+            break;
         default:
             return false;
     }
@@ -8161,7 +8783,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
 
 int ggml_cuda_get_device_count() {
     int device_count;
-
+    if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
+        return 0;
+    }
     return device_count;
 }
 
@@ -8177,27 +8801,16 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
 
 #define UNUSED GGML_UNUSED
 
-
-};
-
-static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
-    return GGML_CUDA_NAME;
-
-    UNUSED(backend);
-}
-
-static void ggml_backend_cuda_free(ggml_backend_t backend) {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-    delete cuda_ctx;
-    delete backend;
-}
+// cuda buffer
 
 struct ggml_backend_buffer_context_cuda {
-
-
+    int device;
+    void * dev_ptr = nullptr;
     ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
     size_t temp_tensor_extra_index = 0;
 
+    ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
+
     ~ggml_backend_buffer_context_cuda() {
         delete[] temp_tensor_extras;
     }
@@ -8218,41 +8831,20 @@ struct ggml_backend_buffer_context_cuda {
 
 static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-    CUDA_CHECK(cudaFree(ctx->
+    CUDA_CHECK(cudaFree(ctx->dev_ptr));
     delete ctx;
 }
 
 static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-    return ctx->
-}
-
-static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    int64_t row_low = 0;
-    int64_t row_high = ggml_nrows(tensor);
-    int64_t nrows_split = row_high - row_low;
-
-    size_t size = ggml_nbytes_split(tensor, nrows_split);
-
-    int64_t ne0 = tensor->ne[0];
-
-    if (ggml_is_quantized(tensor->type)) {
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
-                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
-        }
-    }
-
-    return size;
-
-    UNUSED(buffer);
+    return ctx->dev_ptr;
 }
 
 static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
     if (tensor->view_src != NULL && tensor->view_offs == 0) {
-        assert(tensor->view_src->buffer->
+        assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
         tensor->backend = tensor->view_src->backend;
         tensor->extra = tensor->view_src->extra;
         return;
@@ -8260,7 +8852,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
 
     ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
 
-    extra->data_device[
+    extra->data_device[ctx->device] = tensor->data;
 
     tensor->backend = GGML_BACKEND_GPU;
     tensor->extra = extra;
@@ -8272,64 +8864,208 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
         int64_t nrows_split = row_high - row_low;
 
         size_t original_size = ggml_nbytes_split(tensor, nrows_split);
-        size_t padded_size =
+        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
 
         if (padded_size > original_size && tensor->view_src == nullptr) {
-            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[
+            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
         }
     }
 
     UNUSED(buffer);
 }
 
+static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
+
+    UNUSED(buffer);
+}
+
+static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
+
+    UNUSED(buffer);
+}
+
 static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
-    /* .free_buffer
-    /* .get_base
-    /* .
-    /* .
-    /* .
+    /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .set_tensor      = */ ggml_backend_cuda_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
+    /* .cpy_tensor_from = */ NULL,
+    /* .cpy_tensor_to   = */ NULL,
 };
 
-
-    ggml_cuda_set_device(g_main_device);
+// cuda buffer type
 
-
+static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    int device = (int) (intptr_t) buft->context;
+
+    ggml_cuda_set_device(device);
 
     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
 
-
-    CUDA_CHECK(cudaMalloc(&
+    void * dev_ptr;
+    CUDA_CHECK(cudaMalloc(&dev_ptr, size));
 
-
+    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
+
+    return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
 }
 
-static size_t
+static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 128;
+
+    UNUSED(buft);
+}
+
+static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
+    int64_t row_low = 0;
+    int64_t row_high = ggml_nrows(tensor);
+    int64_t nrows_split = row_high - row_low;
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+    int64_t ne0 = tensor->ne[0];
+
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }
+    }
+
+    return size;
+
+    UNUSED(buft);
+}
+
+static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    return ggml_backend_is_cuda(backend);
+
+    UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
+    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
+    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
+    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
+    static bool ggml_backend_buffer_type_cuda_initialized = false;
+    if (!ggml_backend_buffer_type_cuda_initialized) {
+        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
+            ggml_backend_buffer_type_cuda[i] = {
+                /* .iface   = */ cuda_backend_buffer_type_interface,
+                /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
+            };
+        }
+        ggml_backend_buffer_type_cuda_initialized = true;
+    }
+
+    return &ggml_backend_buffer_type_cuda[device];
+}
+
+// host buffer type
+
+static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
+    delete ctx;
+}
+
+static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * ptr;
+    CUDA_CHECK(cudaMallocHost(&ptr, size));
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
+
+    return buffer;
+
+    UNUSED(buft);
+}
+
+struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
+    /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+    /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+    /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
+        /* .iface   = */ cuda_backend_host_buffer_type_interface,
+        /* .context = */ nullptr,
+    };
+
+    return &ggml_backend_buffer_type_cuda_host;
+}
+
+// backend
+
+struct ggml_backend_context_cuda {
+    int device;
+};
+
+static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+    return GGML_CUDA_NAME;
+
     UNUSED(backend);
 }
 
+static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    delete cuda_ctx;
+    delete backend;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
+}
+
 static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[
-
-    UNUSED(backend);
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
 }
 
 static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[
-
-    UNUSED(backend);
+    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
 }
 
 static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
-
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
 
     UNUSED(backend);
 }
@@ -8343,14 +9079,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
     UNUSED(cgraph);
 }
 
-
+static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
     UNUSED(plan);
 }
 
-
+static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
@@ -8358,7 +9094,9 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
 }
 
 static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    ggml_cuda_set_main_device(cuda_ctx->device);
 
     ggml_compute_params params = {};
     params.type = GGML_TASK_COMPUTE;
@@ -8366,13 +9104,18 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
             continue;
-
+
         assert(node->backend == GGML_BACKEND_GPU);
+        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+        assert(node->extra != nullptr);
+
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
                 assert(node->src[j]->backend == GGML_BACKEND_GPU);
+                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                assert(node->src[j]->extra != nullptr);
             }
         }
 
@@ -8409,27 +9152,98 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     UNUSED(backend);
 }
 
+static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_RELU:
+                    return true;
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
+            {
+                struct ggml_tensor * a;
+                struct ggml_tensor * b;
+                if (op->op == GGML_OP_MUL_MAT) {
+                    a = op->src[0];
+                    b = op->src[1];
+                } else {
+                    a = op->src[2];
+                    b = op->src[1];
+                }
+                if (a->ne[3] != b->ne[3]) {
+                    return false;
+                }
+                return true;
+            } break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_NORM:
+        case GGML_OP_REPEAT:
+        case GGML_OP_GET_ROWS:
+        case GGML_OP_DUP:
+        case GGML_OP_ADD:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_SCALE:
+        case GGML_OP_SQR:
+        case GGML_OP_CLAMP:
+        case GGML_OP_CPY:
+        case GGML_OP_CONT:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_ROPE:
+        case GGML_OP_ALIBI:
+        case GGML_OP_IM2COL:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_ARGSORT:
+            return true;
+        default:
+            return false;
+    }
+
+    UNUSED(backend);
+}
+
 static ggml_backend_i cuda_backend_i = {
-    /* .get_name
-    /* .free
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .supports_op = */ nullptr,
+    /* .get_name                = */ ggml_backend_cuda_name,
+    /* .free                    = */ ggml_backend_cuda_free,
+    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
+    /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
+    /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
+    /* .cpy_tensor_from_async   = */ NULL,
+    /* .cpy_tensor_to_async     = */ NULL,
+    /* .synchronize             = */ ggml_backend_cuda_synchronize,
+    /* .graph_plan_create       = */ ggml_backend_cuda_graph_plan_create,
+    /* .graph_plan_free         = */ ggml_backend_cuda_graph_plan_free,
+    /* .graph_plan_compute      = */ ggml_backend_cuda_graph_plan_compute,
+    /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
+    /* .supports_op             = */ ggml_backend_cuda_supports_op,
 };
 
-ggml_backend_t ggml_backend_cuda_init() {
+ggml_backend_t ggml_backend_cuda_init(int device) {
     ggml_init_cublas(); // TODO: remove from ggml.c
 
-
+    if (device < 0 || device >= ggml_cuda_get_device_count()) {
+        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+        return nullptr;
+    }
+
+    // not strictly necessary, but it may reduce the overhead of the first graph_compute
+    ggml_cuda_set_main_device(device);
+
+    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
+        /* .device = */ device
+    };
 
     ggml_backend_t cuda_backend = new ggml_backend {
         /* .interface = */ cuda_backend_i,
@@ -8438,3 +9252,25 @@ ggml_backend_t ggml_backend_cuda_init() {
 
     return cuda_backend;
 }
+
+bool ggml_backend_is_cuda(ggml_backend_t backend) {
+    return backend->iface.get_name == ggml_backend_cuda_name;
+}
+
+static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
+    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
+    return cuda_backend;
+
+    UNUSED(params);
+}
+
+extern "C" int ggml_backend_cuda_reg_devices() {
+    int device_count = ggml_cuda_get_device_count();
+    //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
+    for (int i = 0; i < device_count; i++) {
+        char name[128];
+        snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
+        ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
+    }
+    return device_count;
+}