llama_cpp 0.12.0 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +78 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +11 -0
- data/vendor/tmp/llama.cpp/Makefile +7 -10
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +512 -261
- data/vendor/tmp/llama.cpp/ggml-backend.h +43 -33
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1494 -559
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1868 -2002
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +2182 -44
- data/vendor/tmp/llama.cpp/ggml-quants.h +36 -1
- data/vendor/tmp/llama.cpp/ggml.c +222 -105
- data/vendor/tmp/llama.cpp/ggml.h +56 -35
- data/vendor/tmp/llama.cpp/llama.cpp +1271 -1618
- data/vendor/tmp/llama.cpp/llama.h +44 -8
- metadata +2 -2
|
@@ -8,8 +8,13 @@
|
|
|
8
8
|
#include <limits>
|
|
9
9
|
#include <stdint.h>
|
|
10
10
|
#include <stdio.h>
|
|
11
|
+
#include <string>
|
|
11
12
|
#include <vector>
|
|
12
|
-
|
|
13
|
+
#include <map>
|
|
14
|
+
#include <array>
|
|
15
|
+
#include "ggml-cuda.h"
|
|
16
|
+
#include "ggml.h"
|
|
17
|
+
#include "ggml-backend-impl.h"
|
|
13
18
|
|
|
14
19
|
#if defined(GGML_USE_HIPBLAS)
|
|
15
20
|
#include <hip/hip_runtime.h>
|
|
@@ -77,6 +82,7 @@
|
|
|
77
82
|
#define cudaMemcpyKind hipMemcpyKind
|
|
78
83
|
#define cudaMemset hipMemset
|
|
79
84
|
#define cudaMemsetAsync hipMemsetAsync
|
|
85
|
+
#define cudaMemGetInfo hipMemGetInfo
|
|
80
86
|
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
|
81
87
|
#define cudaSetDevice hipSetDevice
|
|
82
88
|
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
|
@@ -112,10 +118,9 @@
|
|
|
112
118
|
|
|
113
119
|
#endif // defined(GGML_USE_HIPBLAS)
|
|
114
120
|
|
|
115
|
-
#
|
|
116
|
-
#include "ggml.h"
|
|
117
|
-
#include "ggml-backend-impl.h"
|
|
121
|
+
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
|
|
118
122
|
|
|
123
|
+
#define CC_PASCAL 600
|
|
119
124
|
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
|
|
120
125
|
#define CC_VOLTA 700
|
|
121
126
|
#define CC_OFFSET_AMD 1000000
|
|
@@ -183,7 +188,7 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
|
|
183
188
|
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
|
184
189
|
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
|
|
185
190
|
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
|
186
|
-
#elif defined(
|
|
191
|
+
#elif defined(RDNA3)
|
|
187
192
|
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
|
|
188
193
|
#elif defined(__gfx1010__) || defined(__gfx900__)
|
|
189
194
|
int tmp1;
|
|
@@ -477,6 +482,23 @@ typedef struct {
|
|
|
477
482
|
} block_q6_K;
|
|
478
483
|
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
|
|
479
484
|
|
|
485
|
+
#define QR2_XXS 8
|
|
486
|
+
#define QI2_XXS (QK_K / (4*QR2_XXS))
|
|
487
|
+
typedef struct {
|
|
488
|
+
half d;
|
|
489
|
+
uint16_t qs[QK_K/8];
|
|
490
|
+
} block_iq2_xxs;
|
|
491
|
+
static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
|
|
492
|
+
|
|
493
|
+
#define QR2_XS 8
|
|
494
|
+
#define QI2_XS (QK_K / (4*QR2_XS))
|
|
495
|
+
typedef struct {
|
|
496
|
+
half d;
|
|
497
|
+
uint16_t qs[QK_K/8];
|
|
498
|
+
uint8_t scales[QK_K/32];
|
|
499
|
+
} block_iq2_xs;
|
|
500
|
+
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
|
|
501
|
+
|
|
480
502
|
#define WARP_SIZE 32
|
|
481
503
|
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
|
482
504
|
|
|
@@ -501,6 +523,8 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
|
|
|
501
523
|
#define CUDA_ACC_BLOCK_SIZE 256
|
|
502
524
|
#define CUDA_IM2COL_BLOCK_SIZE 256
|
|
503
525
|
|
|
526
|
+
#define CUDA_Q8_0_NE_ALIGN 2048
|
|
527
|
+
|
|
504
528
|
// dmmv = dequantize_mul_mat_vec
|
|
505
529
|
#ifndef GGML_CUDA_DMMV_X
|
|
506
530
|
#define GGML_CUDA_DMMV_X 32
|
|
@@ -544,19 +568,16 @@ static void ggml_cuda_set_device(const int device) {
|
|
|
544
568
|
|
|
545
569
|
static int g_device_count = -1;
|
|
546
570
|
static int g_main_device = 0;
|
|
547
|
-
static float
|
|
571
|
+
static std::array<float, GGML_CUDA_MAX_DEVICES> g_default_tensor_split = {};
|
|
548
572
|
|
|
549
573
|
struct cuda_device_capabilities {
|
|
550
574
|
int cc; // compute capability
|
|
575
|
+
size_t smpb; // max. shared memory per block
|
|
551
576
|
bool vmm; // virtual memory support
|
|
552
577
|
size_t vmm_granularity; // granularity of virtual memory
|
|
553
578
|
};
|
|
554
579
|
|
|
555
|
-
static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} };
|
|
556
|
-
|
|
557
|
-
static void * g_scratch_buffer = nullptr;
|
|
558
|
-
static size_t g_scratch_size = 0; // disabled by default
|
|
559
|
-
static size_t g_scratch_offset = 0;
|
|
580
|
+
static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} };
|
|
560
581
|
|
|
561
582
|
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
|
562
583
|
|
|
@@ -585,6 +606,19 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
|
|
|
585
606
|
return a;
|
|
586
607
|
}
|
|
587
608
|
|
|
609
|
+
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
|
610
|
+
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
|
611
|
+
#pragma unroll
|
|
612
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
|
613
|
+
a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
|
|
614
|
+
}
|
|
615
|
+
return a;
|
|
616
|
+
#else
|
|
617
|
+
(void) a;
|
|
618
|
+
bad_arch();
|
|
619
|
+
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
|
620
|
+
}
|
|
621
|
+
|
|
588
622
|
static __device__ __forceinline__ float warp_reduce_max(float x) {
|
|
589
623
|
#pragma unroll
|
|
590
624
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
|
@@ -593,6 +627,19 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
|
|
|
593
627
|
return x;
|
|
594
628
|
}
|
|
595
629
|
|
|
630
|
+
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
|
631
|
+
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
|
632
|
+
#pragma unroll
|
|
633
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
|
634
|
+
x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
|
|
635
|
+
}
|
|
636
|
+
return x;
|
|
637
|
+
#else
|
|
638
|
+
(void) x;
|
|
639
|
+
bad_arch();
|
|
640
|
+
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
|
641
|
+
}
|
|
642
|
+
|
|
596
643
|
static __device__ __forceinline__ float op_repeat(const float a, const float b) {
|
|
597
644
|
return b;
|
|
598
645
|
GGML_UNUSED(a);
|
|
@@ -1058,6 +1105,61 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
|
|
|
1058
1105
|
#endif // GGML_CUDA_F16
|
|
1059
1106
|
}
|
|
1060
1107
|
|
|
1108
|
+
template<typename dst_t>
|
|
1109
|
+
static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
|
|
1110
|
+
|
|
1111
|
+
const int i = blockIdx.x;
|
|
1112
|
+
|
|
1113
|
+
// assume 32 threads
|
|
1114
|
+
const int tid = threadIdx.x;
|
|
1115
|
+
const int il = tid/8;
|
|
1116
|
+
const int ir = tid%8;
|
|
1117
|
+
const int ib = 8*i + ir;
|
|
1118
|
+
if (ib >= nb32) {
|
|
1119
|
+
return;
|
|
1120
|
+
}
|
|
1121
|
+
|
|
1122
|
+
dst_t * y = yy + 256*i + 32*ir + 4*il;
|
|
1123
|
+
|
|
1124
|
+
const block_q4_0 * x = (const block_q4_0 *)vx + ib;
|
|
1125
|
+
const float d = __half2float(x->d);
|
|
1126
|
+
const float dm = -8*d;
|
|
1127
|
+
|
|
1128
|
+
const uint8_t * q = x->qs + 4*il;
|
|
1129
|
+
|
|
1130
|
+
for (int l = 0; l < 4; ++l) {
|
|
1131
|
+
y[l+ 0] = d * (q[l] & 0xF) + dm;
|
|
1132
|
+
y[l+16] = d * (q[l] >> 4) + dm;
|
|
1133
|
+
}
|
|
1134
|
+
}
|
|
1135
|
+
|
|
1136
|
+
template<typename dst_t>
|
|
1137
|
+
static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
|
|
1138
|
+
|
|
1139
|
+
const int i = blockIdx.x;
|
|
1140
|
+
|
|
1141
|
+
// assume 32 threads
|
|
1142
|
+
const int tid = threadIdx.x;
|
|
1143
|
+
const int il = tid/8;
|
|
1144
|
+
const int ir = tid%8;
|
|
1145
|
+
const int ib = 8*i + ir;
|
|
1146
|
+
if (ib >= nb32) {
|
|
1147
|
+
return;
|
|
1148
|
+
}
|
|
1149
|
+
|
|
1150
|
+
dst_t * y = yy + 256*i + 32*ir + 4*il;
|
|
1151
|
+
|
|
1152
|
+
const block_q4_1 * x = (const block_q4_1 *)vx + ib;
|
|
1153
|
+
const float2 d = __half22float2(x->dm);
|
|
1154
|
+
|
|
1155
|
+
const uint8_t * q = x->qs + 4*il;
|
|
1156
|
+
|
|
1157
|
+
for (int l = 0; l < 4; ++l) {
|
|
1158
|
+
y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
|
|
1159
|
+
y[l+16] = d.x * (q[l] >> 4) + d.y;
|
|
1160
|
+
}
|
|
1161
|
+
}
|
|
1162
|
+
|
|
1061
1163
|
//================================== k-quants
|
|
1062
1164
|
|
|
1063
1165
|
template<typename dst_t>
|
|
@@ -1292,6 +1394,281 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
|
|
|
1292
1394
|
#endif
|
|
1293
1395
|
}
|
|
1294
1396
|
|
|
1397
|
+
static const __device__ uint64_t iq2xxs_grid[256] = {
|
|
1398
|
+
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
|
1399
|
+
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
|
|
1400
|
+
0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
|
|
1401
|
+
0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
|
|
1402
|
+
0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
|
|
1403
|
+
0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
|
|
1404
|
+
0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
|
|
1405
|
+
0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
|
|
1406
|
+
0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
|
|
1407
|
+
0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
|
|
1408
|
+
0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
|
|
1409
|
+
0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
|
|
1410
|
+
0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
|
|
1411
|
+
0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
|
|
1412
|
+
0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
|
|
1413
|
+
0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
|
|
1414
|
+
0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
|
|
1415
|
+
0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
|
|
1416
|
+
0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
|
|
1417
|
+
0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
|
|
1418
|
+
0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
|
|
1419
|
+
0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
|
|
1420
|
+
0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
|
|
1421
|
+
0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
|
|
1422
|
+
0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
|
|
1423
|
+
0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
|
|
1424
|
+
0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
|
|
1425
|
+
0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
|
|
1426
|
+
0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
|
|
1427
|
+
0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
|
|
1428
|
+
0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
|
|
1429
|
+
0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
|
|
1430
|
+
0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
|
|
1431
|
+
0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
|
|
1432
|
+
0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
|
|
1433
|
+
0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
|
|
1434
|
+
0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
|
|
1435
|
+
0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
|
|
1436
|
+
0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
|
|
1437
|
+
0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
|
|
1438
|
+
0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
|
|
1439
|
+
0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
|
|
1440
|
+
0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
|
|
1441
|
+
0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
|
|
1442
|
+
0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
|
|
1443
|
+
0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
|
|
1444
|
+
0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
|
|
1445
|
+
0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
|
|
1446
|
+
0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
|
|
1447
|
+
0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
|
|
1448
|
+
0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
|
|
1449
|
+
0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
|
|
1450
|
+
0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
|
|
1451
|
+
0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
|
|
1452
|
+
0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
|
|
1453
|
+
0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
|
|
1454
|
+
0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
|
|
1455
|
+
0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
|
|
1456
|
+
0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
|
|
1457
|
+
0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
|
|
1458
|
+
0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
|
|
1459
|
+
0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
|
|
1460
|
+
0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
|
|
1461
|
+
0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
|
|
1462
|
+
};
|
|
1463
|
+
|
|
1464
|
+
static const __device__ uint64_t iq2xs_grid[512] = {
|
|
1465
|
+
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
|
1466
|
+
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
|
1467
|
+
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
|
1468
|
+
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
|
1469
|
+
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
|
1470
|
+
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
|
|
1471
|
+
0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
|
|
1472
|
+
0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
|
|
1473
|
+
0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
|
|
1474
|
+
0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
|
|
1475
|
+
0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
|
|
1476
|
+
0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
|
|
1477
|
+
0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
|
|
1478
|
+
0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
|
|
1479
|
+
0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
|
|
1480
|
+
0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
|
|
1481
|
+
0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
|
|
1482
|
+
0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
|
|
1483
|
+
0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
|
|
1484
|
+
0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
|
|
1485
|
+
0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
|
|
1486
|
+
0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
|
|
1487
|
+
0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
|
|
1488
|
+
0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
|
|
1489
|
+
0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
|
|
1490
|
+
0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
|
|
1491
|
+
0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
|
|
1492
|
+
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
|
|
1493
|
+
0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
|
|
1494
|
+
0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
|
|
1495
|
+
0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
|
|
1496
|
+
0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
|
|
1497
|
+
0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
|
|
1498
|
+
0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
|
|
1499
|
+
0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
|
|
1500
|
+
0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
|
|
1501
|
+
0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
|
|
1502
|
+
0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
|
|
1503
|
+
0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
|
|
1504
|
+
0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
|
|
1505
|
+
0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
|
|
1506
|
+
0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
|
|
1507
|
+
0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
|
|
1508
|
+
0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
|
|
1509
|
+
0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
|
|
1510
|
+
0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
|
|
1511
|
+
0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
|
|
1512
|
+
0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
|
|
1513
|
+
0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
|
|
1514
|
+
0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
|
|
1515
|
+
0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
|
|
1516
|
+
0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
|
|
1517
|
+
0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
|
|
1518
|
+
0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
|
|
1519
|
+
0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
|
|
1520
|
+
0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
|
|
1521
|
+
0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
|
|
1522
|
+
0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
|
|
1523
|
+
0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
|
|
1524
|
+
0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
|
|
1525
|
+
0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
|
|
1526
|
+
0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
|
|
1527
|
+
0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
|
|
1528
|
+
0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
|
|
1529
|
+
0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
|
|
1530
|
+
0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
|
|
1531
|
+
0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
|
|
1532
|
+
0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
|
|
1533
|
+
0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
|
|
1534
|
+
0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
|
|
1535
|
+
0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
|
|
1536
|
+
0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
|
|
1537
|
+
0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
|
|
1538
|
+
0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
|
|
1539
|
+
0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
|
|
1540
|
+
0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
|
|
1541
|
+
0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
|
|
1542
|
+
0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
|
|
1543
|
+
0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
|
|
1544
|
+
0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
|
|
1545
|
+
0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
|
|
1546
|
+
0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
|
|
1547
|
+
0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
|
|
1548
|
+
0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
|
|
1549
|
+
0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
|
|
1550
|
+
0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
|
|
1551
|
+
0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
|
|
1552
|
+
0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
|
|
1553
|
+
0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
|
|
1554
|
+
0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
|
|
1555
|
+
0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
|
|
1556
|
+
0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
|
|
1557
|
+
0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
|
|
1558
|
+
0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
|
|
1559
|
+
0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
|
|
1560
|
+
0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
|
|
1561
|
+
0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
|
|
1562
|
+
0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
|
|
1563
|
+
0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
|
|
1564
|
+
0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
|
|
1565
|
+
0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
|
|
1566
|
+
0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
|
|
1567
|
+
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
|
|
1568
|
+
0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
|
|
1569
|
+
0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
|
|
1570
|
+
0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
|
|
1571
|
+
0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
|
|
1572
|
+
0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
|
|
1573
|
+
0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
|
|
1574
|
+
0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
|
|
1575
|
+
0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
|
|
1576
|
+
0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
|
|
1577
|
+
0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
|
|
1578
|
+
0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
|
|
1579
|
+
0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
|
|
1580
|
+
0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
|
|
1581
|
+
0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
|
|
1582
|
+
0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
|
|
1583
|
+
0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
|
|
1584
|
+
0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
|
|
1585
|
+
0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
|
|
1586
|
+
0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
|
|
1587
|
+
0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
|
|
1588
|
+
0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
|
|
1589
|
+
0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
|
|
1590
|
+
0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
|
|
1591
|
+
0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
|
|
1592
|
+
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
|
1593
|
+
};
|
|
1594
|
+
|
|
1595
|
+
static const __device__ uint8_t ksigns_iq2xs[128] = {
|
|
1596
|
+
0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
|
|
1597
|
+
144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
|
|
1598
|
+
160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175,
|
|
1599
|
+
48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63,
|
|
1600
|
+
192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207,
|
|
1601
|
+
80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95,
|
|
1602
|
+
96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
|
|
1603
|
+
240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
|
|
1604
|
+
};
|
|
1605
|
+
|
|
1606
|
+
static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
|
|
1607
|
+
|
|
1608
|
+
inline bool ggml_cuda_supports_mmq(enum ggml_type type) {
|
|
1609
|
+
switch (type) {
|
|
1610
|
+
case GGML_TYPE_Q4_0:
|
|
1611
|
+
case GGML_TYPE_Q4_1:
|
|
1612
|
+
case GGML_TYPE_Q5_0:
|
|
1613
|
+
case GGML_TYPE_Q5_1:
|
|
1614
|
+
case GGML_TYPE_Q8_0:
|
|
1615
|
+
case GGML_TYPE_Q2_K:
|
|
1616
|
+
case GGML_TYPE_Q3_K:
|
|
1617
|
+
case GGML_TYPE_Q4_K:
|
|
1618
|
+
case GGML_TYPE_Q5_K:
|
|
1619
|
+
case GGML_TYPE_Q6_K:
|
|
1620
|
+
return true;
|
|
1621
|
+
default:
|
|
1622
|
+
return false;
|
|
1623
|
+
}
|
|
1624
|
+
}
|
|
1625
|
+
|
|
1626
|
+
template<typename dst_t>
|
|
1627
|
+
static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
|
1628
|
+
|
|
1629
|
+
const int i = blockIdx.x;
|
|
1630
|
+
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
|
1631
|
+
|
|
1632
|
+
const int tid = threadIdx.x;
|
|
1633
|
+
#if QK_K == 256
|
|
1634
|
+
const int il = tid/8; // 0...3
|
|
1635
|
+
const int ib = tid%8; // 0...7
|
|
1636
|
+
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
1637
|
+
const uint16_t * q2 = x[i].qs + 4*ib;
|
|
1638
|
+
const uint8_t * aux8 = (const uint8_t *)q2;
|
|
1639
|
+
const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]);
|
|
1640
|
+
const uint32_t aux32 = q2[2] | (q2[3] << 16);
|
|
1641
|
+
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
|
|
1642
|
+
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
|
|
1643
|
+
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
|
1644
|
+
#else
|
|
1645
|
+
assert(false);
|
|
1646
|
+
#endif
|
|
1647
|
+
|
|
1648
|
+
}
|
|
1649
|
+
|
|
1650
|
+
template<typename dst_t>
|
|
1651
|
+
static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
|
1652
|
+
|
|
1653
|
+
const int i = blockIdx.x;
|
|
1654
|
+
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
|
1655
|
+
|
|
1656
|
+
const int tid = threadIdx.x;
|
|
1657
|
+
#if QK_K == 256
|
|
1658
|
+
const int il = tid/8; // 0...3
|
|
1659
|
+
const int ib = tid%8; // 0...7
|
|
1660
|
+
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
1661
|
+
const uint16_t * q2 = x[i].qs + 4*ib;
|
|
1662
|
+
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
|
|
1663
|
+
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
|
1664
|
+
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
|
|
1665
|
+
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
|
1666
|
+
#else
|
|
1667
|
+
assert(false);
|
|
1668
|
+
#endif
|
|
1669
|
+
|
|
1670
|
+
}
|
|
1671
|
+
|
|
1295
1672
|
static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
|
1296
1673
|
|
|
1297
1674
|
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
|
@@ -1872,14 +2249,6 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
|
|
|
1872
2249
|
v.y = x[ib + iqs + 1];
|
|
1873
2250
|
}
|
|
1874
2251
|
|
|
1875
|
-
static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
|
1876
|
-
const float * x = (const float *) vx;
|
|
1877
|
-
|
|
1878
|
-
// automatic half -> float type cast if dfloat == float
|
|
1879
|
-
v.x = x[ib + iqs + 0];
|
|
1880
|
-
v.y = x[ib + iqs + 1];
|
|
1881
|
-
}
|
|
1882
|
-
|
|
1883
2252
|
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
|
|
1884
2253
|
const int ix = blockDim.x*blockIdx.x + threadIdx.x;
|
|
1885
2254
|
|
|
@@ -1983,7 +2352,7 @@ static __global__ void k_get_rows_float(
|
|
|
1983
2352
|
|
|
1984
2353
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
|
1985
2354
|
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
|
|
1986
|
-
const int i = blockDim.x*blockIdx.x +
|
|
2355
|
+
const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
|
|
1987
2356
|
|
|
1988
2357
|
if (i >= k) {
|
|
1989
2358
|
return;
|
|
@@ -2002,6 +2371,58 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
|
|
|
2002
2371
|
y[iybs + iqs + y_offset] = v.y;
|
|
2003
2372
|
}
|
|
2004
2373
|
|
|
2374
|
+
template <typename src_t, typename dst_t>
|
|
2375
|
+
static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
|
|
2376
|
+
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
|
2377
|
+
|
|
2378
|
+
if (i >= k) {
|
|
2379
|
+
return;
|
|
2380
|
+
}
|
|
2381
|
+
|
|
2382
|
+
const src_t * x = (src_t *) vx;
|
|
2383
|
+
|
|
2384
|
+
y[i] = x[i];
|
|
2385
|
+
}
|
|
2386
|
+
|
|
2387
|
+
template <bool need_check>
|
|
2388
|
+
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
|
|
2389
|
+
#if __CUDA_ARCH__ >= CC_PASCAL
|
|
2390
|
+
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
|
|
2391
|
+
|
|
2392
|
+
const int i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
|
|
2393
|
+
const int * x0 = ((int *) vx) + blockIdx.x * nint;
|
|
2394
|
+
half2 * y2 = (half2 *) (y + i0);
|
|
2395
|
+
|
|
2396
|
+
__shared__ int vals[nint];
|
|
2397
|
+
|
|
2398
|
+
#pragma unroll
|
|
2399
|
+
for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
|
|
2400
|
+
if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
|
|
2401
|
+
break;
|
|
2402
|
+
}
|
|
2403
|
+
|
|
2404
|
+
const int ix = ix0 + threadIdx.x;
|
|
2405
|
+
vals[ix] = x0[ix];
|
|
2406
|
+
}
|
|
2407
|
+
|
|
2408
|
+
#pragma unroll
|
|
2409
|
+
for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
|
|
2410
|
+
if (need_check && i0 + iy + 2*threadIdx.x >= k) {
|
|
2411
|
+
return;
|
|
2412
|
+
}
|
|
2413
|
+
|
|
2414
|
+
const half * b0 = ((const half *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
|
|
2415
|
+
const half d = *b0;
|
|
2416
|
+
const char2 qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
|
|
2417
|
+
|
|
2418
|
+
y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
|
|
2419
|
+
}
|
|
2420
|
+
#else
|
|
2421
|
+
(void) vx; (void) y; (void) k;
|
|
2422
|
+
bad_arch();
|
|
2423
|
+
#endif // __CUDA_ARCH__ >= CC_PASCAL
|
|
2424
|
+
}
|
|
2425
|
+
|
|
2005
2426
|
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
|
|
2006
2427
|
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
|
|
2007
2428
|
|
|
@@ -3820,6 +4241,91 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
|
|
|
3820
4241
|
return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
|
|
3821
4242
|
}
|
|
3822
4243
|
|
|
4244
|
+
static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
|
|
4245
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
|
4246
|
+
#if QK_K == 256
|
|
4247
|
+
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
|
|
4248
|
+
|
|
4249
|
+
#if QR2_XXS == 8
|
|
4250
|
+
const int ib32 = iqs;
|
|
4251
|
+
const uint16_t * q2 = bq2->qs + 4*ib32;
|
|
4252
|
+
const uint8_t * aux8 = (const uint8_t *)q2;
|
|
4253
|
+
const int8_t * q8 = bq8_1[ib32].qs;
|
|
4254
|
+
uint32_t aux32 = q2[2] | (q2[3] << 16);
|
|
4255
|
+
int sumi = 0;
|
|
4256
|
+
for (int l = 0; l < 4; ++l) {
|
|
4257
|
+
const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
|
|
4258
|
+
const uint8_t signs = ksigns_iq2xs[aux32 & 127];
|
|
4259
|
+
for (int j = 0; j < 8; ++j) {
|
|
4260
|
+
sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
4261
|
+
}
|
|
4262
|
+
q8 += 8;
|
|
4263
|
+
aux32 >>= 7;
|
|
4264
|
+
}
|
|
4265
|
+
const float d = (float)bq2->d * (0.5f + aux32) * (float)bq8_1[ib32].ds.x * 0.25f;
|
|
4266
|
+
return d * sumi;
|
|
4267
|
+
#else
|
|
4268
|
+
// iqs is 0...15
|
|
4269
|
+
const int ib32 = iqs/2;
|
|
4270
|
+
const int il = iqs%2;
|
|
4271
|
+
const uint16_t * q2 = bq2->qs + 4*ib32;
|
|
4272
|
+
const uint8_t * aux8 = (const uint8_t *)q2;
|
|
4273
|
+
const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
|
|
4274
|
+
const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
|
|
4275
|
+
const uint32_t aux32 = q2[2] | (q2[3] << 16);
|
|
4276
|
+
const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * (float)bq8_1[ib32].ds.x * 0.25f;
|
|
4277
|
+
const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
|
|
4278
|
+
const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
|
|
4279
|
+
const int8_t * q8 = bq8_1[ib32].qs + 16*il;
|
|
4280
|
+
int sumi1 = 0, sumi2 = 0;
|
|
4281
|
+
for (int j = 0; j < 8; ++j) {
|
|
4282
|
+
sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
|
|
4283
|
+
sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
|
|
4284
|
+
}
|
|
4285
|
+
return d * (sumi1 + sumi2);
|
|
4286
|
+
#endif
|
|
4287
|
+
#else
|
|
4288
|
+
assert(false);
|
|
4289
|
+
return 0.f;
|
|
4290
|
+
#endif
|
|
4291
|
+
}
|
|
4292
|
+
|
|
4293
|
+
static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
|
4294
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
|
4295
|
+
#if QK_K == 256
|
|
4296
|
+
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
|
|
4297
|
+
|
|
4298
|
+
const int ib32 = iqs;
|
|
4299
|
+
const uint16_t * q2 = bq2->qs + 4*ib32;
|
|
4300
|
+
const int8_t * q8 = bq8_1[ib32].qs;
|
|
4301
|
+
const uint8_t ls1 = bq2->scales[ib32] & 0xf;
|
|
4302
|
+
const uint8_t ls2 = bq2->scales[ib32] >> 4;
|
|
4303
|
+
int sumi1 = 0;
|
|
4304
|
+
for (int l = 0; l < 2; ++l) {
|
|
4305
|
+
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
|
4306
|
+
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
|
4307
|
+
for (int j = 0; j < 8; ++j) {
|
|
4308
|
+
sumi1 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
4309
|
+
}
|
|
4310
|
+
q8 += 8;
|
|
4311
|
+
}
|
|
4312
|
+
int sumi2 = 0;
|
|
4313
|
+
for (int l = 2; l < 4; ++l) {
|
|
4314
|
+
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
|
4315
|
+
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
|
4316
|
+
for (int j = 0; j < 8; ++j) {
|
|
4317
|
+
sumi2 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
4318
|
+
}
|
|
4319
|
+
q8 += 8;
|
|
4320
|
+
}
|
|
4321
|
+
const float d = (float)bq2->d * (float)bq8_1[ib32].ds.x * 0.25f;
|
|
4322
|
+
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
|
|
4323
|
+
#else
|
|
4324
|
+
assert(false);
|
|
4325
|
+
return 0.f;
|
|
4326
|
+
#endif
|
|
4327
|
+
}
|
|
4328
|
+
|
|
3823
4329
|
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
|
|
3824
4330
|
allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
|
|
3825
4331
|
static __device__ __forceinline__ void mul_mat_q(
|
|
@@ -5201,142 +5707,300 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
|
|
|
5201
5707
|
dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
|
|
5202
5708
|
}
|
|
5203
5709
|
|
|
5204
|
-
|
|
5710
|
+
template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
|
|
5711
|
+
static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
|
|
5712
|
+
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
|
5713
|
+
const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
|
|
5714
|
+
const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;
|
|
5715
|
+
|
|
5205
5716
|
const int tid = threadIdx.x;
|
|
5206
5717
|
const int rowx = blockIdx.x;
|
|
5207
5718
|
const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
|
|
5208
5719
|
|
|
5209
|
-
const int block_size = blockDim.x;
|
|
5720
|
+
const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
|
|
5210
5721
|
|
|
5211
5722
|
const int warp_id = threadIdx.x / WARP_SIZE;
|
|
5212
5723
|
const int lane_id = threadIdx.x % WARP_SIZE;
|
|
5213
5724
|
|
|
5214
|
-
__shared__
|
|
5725
|
+
extern __shared__ half data_soft_max_f16[];
|
|
5726
|
+
half * buf_iw = data_soft_max_f16 + 0; // shared memory buffer for inter-warp communication
|
|
5727
|
+
// (shared memory) buffer to cache values between iterations:
|
|
5728
|
+
half2 * vals = vals_smem ? (half2 *) (buf_iw + WARP_SIZE) : (half2 *) (dst + rowx*ncols_data);
|
|
5729
|
+
// if the buffer is larger than max. shared memory per block, use dst as temp. buffer instead
|
|
5730
|
+
// in that case col_smem == col_data must be enforced to avoid race conditions
|
|
5215
5731
|
|
|
5216
|
-
|
|
5732
|
+
half2 max_val = make_half2(-INFINITY, -INFINITY);
|
|
5217
5733
|
|
|
5218
|
-
|
|
5219
|
-
|
|
5220
|
-
const int
|
|
5221
|
-
|
|
5734
|
+
#pragma unroll
|
|
5735
|
+
for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
|
|
5736
|
+
const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
|
|
5737
|
+
const int col_smem = vals_smem ? col0 + tid : col_data;
|
|
5738
|
+
|
|
5739
|
+
const int ix = rowx*ncols_data + col_data;
|
|
5740
|
+
const int iy = rowy*ncols_data + col_data;
|
|
5741
|
+
|
|
5742
|
+
half2 val;
|
|
5743
|
+
if (need_check && col_data + 0 >= ncols_data) {
|
|
5744
|
+
val.x = -INFINITY;
|
|
5745
|
+
} else {
|
|
5746
|
+
val.x = x[ix + 0]*scale + (y ? y[iy + 0] : 0.0f);
|
|
5747
|
+
}
|
|
5748
|
+
if (need_check && col_data + WARP_SIZE >= ncols_data) {
|
|
5749
|
+
val.y = -INFINITY;
|
|
5750
|
+
} else {
|
|
5751
|
+
val.y = x[ix + WARP_SIZE]*scale + (y ? y[iy + WARP_SIZE] : 0.0f);
|
|
5752
|
+
}
|
|
5753
|
+
if (!need_check || col_smem < (vals_smem ? ncols_smem : ncols_data)) {
|
|
5754
|
+
vals[col_smem] = val;
|
|
5755
|
+
}
|
|
5756
|
+
max_val = __hmax2(max_val, val);
|
|
5222
5757
|
}
|
|
5223
5758
|
|
|
5224
5759
|
// find the max value in the block
|
|
5225
5760
|
max_val = warp_reduce_max(max_val);
|
|
5226
5761
|
if (block_size > WARP_SIZE) {
|
|
5227
5762
|
if (warp_id == 0) {
|
|
5228
|
-
|
|
5763
|
+
buf_iw[lane_id] = -INFINITY;
|
|
5229
5764
|
}
|
|
5230
5765
|
__syncthreads();
|
|
5231
5766
|
|
|
5232
5767
|
if (lane_id == 0) {
|
|
5233
|
-
|
|
5768
|
+
buf_iw[warp_id] = __hmax(max_val.x, max_val.y);
|
|
5234
5769
|
}
|
|
5235
5770
|
__syncthreads();
|
|
5236
5771
|
|
|
5237
|
-
max_val =
|
|
5772
|
+
max_val = __half2half2(buf_iw[lane_id]);
|
|
5238
5773
|
max_val = warp_reduce_max(max_val);
|
|
5774
|
+
} else {
|
|
5775
|
+
max_val = __half2half2(__hmax(max_val.x, max_val.y));
|
|
5239
5776
|
}
|
|
5240
5777
|
|
|
5241
|
-
|
|
5778
|
+
half2 tmp = make_half2(0.0f, 0.0f); // partial sums
|
|
5779
|
+
|
|
5780
|
+
#pragma unroll
|
|
5781
|
+
for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
|
|
5782
|
+
const int col_smem = vals_smem ? col0 + tid : 2*col0 + 2*warp_id*WARP_SIZE + lane_id;
|
|
5783
|
+
|
|
5784
|
+
if (ncols_template == 0 && col_smem >= (vals_smem ? ncols_smem : ncols_data)) {
|
|
5785
|
+
break;
|
|
5786
|
+
}
|
|
5787
|
+
|
|
5788
|
+
const half2 val = h2exp(vals[col_smem] - max_val);
|
|
5242
5789
|
|
|
5243
|
-
for (int col = tid; col < ncols; col += block_size) {
|
|
5244
|
-
const int ix = rowx*ncols + col;
|
|
5245
|
-
const int iy = rowy*ncols + col;
|
|
5246
|
-
const float val = expf((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val);
|
|
5247
5790
|
tmp += val;
|
|
5248
|
-
|
|
5791
|
+
vals[col_smem] = val;
|
|
5249
5792
|
}
|
|
5250
5793
|
|
|
5251
5794
|
// find the sum of exps in the block
|
|
5252
5795
|
tmp = warp_reduce_sum(tmp);
|
|
5253
5796
|
if (block_size > WARP_SIZE) {
|
|
5254
5797
|
if (warp_id == 0) {
|
|
5255
|
-
|
|
5798
|
+
buf_iw[lane_id] = 0.0f;
|
|
5256
5799
|
}
|
|
5257
5800
|
__syncthreads();
|
|
5258
5801
|
|
|
5259
5802
|
if (lane_id == 0) {
|
|
5260
|
-
|
|
5803
|
+
buf_iw[warp_id] = tmp.x + tmp.y;
|
|
5261
5804
|
}
|
|
5262
5805
|
__syncthreads();
|
|
5263
5806
|
|
|
5264
|
-
tmp =
|
|
5807
|
+
tmp = __half2half2(buf_iw[lane_id]);
|
|
5265
5808
|
tmp = warp_reduce_sum(tmp);
|
|
5809
|
+
} else {
|
|
5810
|
+
tmp = __half2half2(tmp.x + tmp.y);
|
|
5266
5811
|
}
|
|
5267
5812
|
|
|
5268
|
-
const
|
|
5269
|
-
|
|
5270
|
-
for (int col = tid; col < ncols; col += block_size) {
|
|
5271
|
-
const int i = rowx*ncols + col;
|
|
5272
|
-
dst[i] *= inv_tmp;
|
|
5273
|
-
}
|
|
5274
|
-
}
|
|
5813
|
+
const half2 inv_sum = make_half2(1.0f, 1.0f) / tmp;
|
|
5275
5814
|
|
|
5276
|
-
|
|
5277
|
-
|
|
5815
|
+
#pragma unroll
|
|
5816
|
+
for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
|
|
5817
|
+
const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
|
|
5818
|
+
const int col_smem = vals_smem ? col0 + tid : col_data;
|
|
5278
5819
|
|
|
5279
|
-
|
|
5280
|
-
|
|
5281
|
-
}
|
|
5820
|
+
const int idst = rowx*ncols_data + col_data;
|
|
5821
|
+
const half2 result = vals[col_smem] * inv_sum;
|
|
5282
5822
|
|
|
5283
|
-
|
|
5284
|
-
|
|
5823
|
+
if (need_check && col_data + 0 >= ncols_data) {
|
|
5824
|
+
return;
|
|
5825
|
+
}
|
|
5826
|
+
dst[idst] = result.x;
|
|
5285
5827
|
|
|
5286
|
-
|
|
5287
|
-
|
|
5828
|
+
if (need_check && col_data + WARP_SIZE >= ncols_data) {
|
|
5829
|
+
return;
|
|
5830
|
+
}
|
|
5288
5831
|
|
|
5289
|
-
|
|
5290
|
-
return;
|
|
5832
|
+
dst[idst + WARP_SIZE] = result.y;
|
|
5291
5833
|
}
|
|
5292
|
-
|
|
5293
|
-
|
|
5834
|
+
#else
|
|
5835
|
+
(void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
|
|
5836
|
+
bad_arch();
|
|
5837
|
+
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
|
5294
5838
|
}
|
|
5295
5839
|
|
|
5296
|
-
|
|
5297
|
-
|
|
5298
|
-
|
|
5299
|
-
int s0, int s1, int p0, int p1, int d0, int d1) {
|
|
5300
|
-
const int i = threadIdx.x + blockIdx.x * blockDim.x;
|
|
5301
|
-
if (i >= pelements) {
|
|
5302
|
-
return;
|
|
5303
|
-
}
|
|
5840
|
+
template <bool vals_smem, int ncols_template, int block_size_template>
|
|
5841
|
+
static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
|
|
5842
|
+
const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
|
|
5304
5843
|
|
|
5305
|
-
const int
|
|
5306
|
-
const int
|
|
5307
|
-
const int
|
|
5308
|
-
const int ky = (i - kd) / OW;
|
|
5309
|
-
const int ix = i % OW;
|
|
5844
|
+
const int tid = threadIdx.x;
|
|
5845
|
+
const int rowx = blockIdx.x;
|
|
5846
|
+
const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
|
|
5310
5847
|
|
|
5311
|
-
const
|
|
5312
|
-
const int64_t iih = blockIdx.y * s1 + ky * d1 - p1;
|
|
5848
|
+
const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
|
|
5313
5849
|
|
|
5314
|
-
const
|
|
5315
|
-
|
|
5316
|
-
(blockIdx.z * (KW * KH) + ky * KW + kx);
|
|
5850
|
+
const int warp_id = threadIdx.x / WARP_SIZE;
|
|
5851
|
+
const int lane_id = threadIdx.x % WARP_SIZE;
|
|
5317
5852
|
|
|
5318
|
-
|
|
5319
|
-
|
|
5320
|
-
|
|
5321
|
-
|
|
5322
|
-
dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
|
|
5323
|
-
}
|
|
5324
|
-
}
|
|
5853
|
+
extern __shared__ float data_soft_max_f32[];
|
|
5854
|
+
float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
|
|
5855
|
+
// shared memory buffer to cache values between iterations:
|
|
5856
|
+
float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols;
|
|
5325
5857
|
|
|
5326
|
-
|
|
5327
|
-
static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
|
5328
|
-
const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
|
|
5858
|
+
float max_val = -INFINITY;
|
|
5329
5859
|
|
|
5330
|
-
|
|
5860
|
+
#pragma unroll
|
|
5861
|
+
for (int col0 = 0; col0 < ncols; col0 += block_size) {
|
|
5862
|
+
const int col = col0 + tid;
|
|
5331
5863
|
|
|
5332
|
-
|
|
5333
|
-
|
|
5334
|
-
|
|
5864
|
+
if (ncols_template == 0 && col >= ncols) {
|
|
5865
|
+
break;
|
|
5866
|
+
}
|
|
5335
5867
|
|
|
5336
|
-
|
|
5337
|
-
|
|
5338
|
-
|
|
5339
|
-
|
|
5868
|
+
const int ix = rowx*ncols + col;
|
|
5869
|
+
const int iy = rowy*ncols + col;
|
|
5870
|
+
|
|
5871
|
+
const float val = x[ix]*scale + (y ? y[iy] : 0.0f);
|
|
5872
|
+
vals[col] = val;
|
|
5873
|
+
max_val = max(max_val, val);
|
|
5874
|
+
}
|
|
5875
|
+
|
|
5876
|
+
// find the max value in the block
|
|
5877
|
+
max_val = warp_reduce_max(max_val);
|
|
5878
|
+
if (block_size > WARP_SIZE) {
|
|
5879
|
+
if (warp_id == 0) {
|
|
5880
|
+
buf_iw[lane_id] = -INFINITY;
|
|
5881
|
+
}
|
|
5882
|
+
__syncthreads();
|
|
5883
|
+
|
|
5884
|
+
if (lane_id == 0) {
|
|
5885
|
+
buf_iw[warp_id] = max_val;
|
|
5886
|
+
}
|
|
5887
|
+
__syncthreads();
|
|
5888
|
+
|
|
5889
|
+
max_val = buf_iw[lane_id];
|
|
5890
|
+
max_val = warp_reduce_max(max_val);
|
|
5891
|
+
}
|
|
5892
|
+
|
|
5893
|
+
float tmp = 0.0f; // partial sum
|
|
5894
|
+
|
|
5895
|
+
#pragma unroll
|
|
5896
|
+
for (int col0 = 0; col0 < ncols; col0 += block_size) {
|
|
5897
|
+
const int col = col0 + tid;
|
|
5898
|
+
|
|
5899
|
+
if (ncols_template == 0 && col >= ncols) {
|
|
5900
|
+
break;
|
|
5901
|
+
}
|
|
5902
|
+
|
|
5903
|
+
const float val = expf(vals[col] - max_val);
|
|
5904
|
+
tmp += val;
|
|
5905
|
+
vals[col] = val;
|
|
5906
|
+
}
|
|
5907
|
+
|
|
5908
|
+
// find the sum of exps in the block
|
|
5909
|
+
tmp = warp_reduce_sum(tmp);
|
|
5910
|
+
if (block_size > WARP_SIZE) {
|
|
5911
|
+
if (warp_id == 0) {
|
|
5912
|
+
buf_iw[lane_id] = 0.0f;
|
|
5913
|
+
}
|
|
5914
|
+
__syncthreads();
|
|
5915
|
+
|
|
5916
|
+
if (lane_id == 0) {
|
|
5917
|
+
buf_iw[warp_id] = tmp;
|
|
5918
|
+
}
|
|
5919
|
+
__syncthreads();
|
|
5920
|
+
|
|
5921
|
+
tmp = buf_iw[lane_id];
|
|
5922
|
+
tmp = warp_reduce_sum(tmp);
|
|
5923
|
+
}
|
|
5924
|
+
|
|
5925
|
+
const float inv_sum = 1.0f / tmp;
|
|
5926
|
+
|
|
5927
|
+
#pragma unroll
|
|
5928
|
+
for (int col0 = 0; col0 < ncols; col0 += block_size) {
|
|
5929
|
+
const int col = col0 + tid;
|
|
5930
|
+
|
|
5931
|
+
if (ncols_template == 0 && col >= ncols) {
|
|
5932
|
+
return;
|
|
5933
|
+
}
|
|
5934
|
+
|
|
5935
|
+
const int idst = rowx*ncols + col;
|
|
5936
|
+
dst[idst] = vals[col] * inv_sum;
|
|
5937
|
+
}
|
|
5938
|
+
}
|
|
5939
|
+
|
|
5940
|
+
static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
|
|
5941
|
+
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
|
5942
|
+
|
|
5943
|
+
if (i >= k) {
|
|
5944
|
+
return;
|
|
5945
|
+
}
|
|
5946
|
+
|
|
5947
|
+
dst[i] = scale * x[i];
|
|
5948
|
+
}
|
|
5949
|
+
|
|
5950
|
+
static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
|
|
5951
|
+
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
|
5952
|
+
|
|
5953
|
+
if (i >= k) {
|
|
5954
|
+
return;
|
|
5955
|
+
}
|
|
5956
|
+
|
|
5957
|
+
dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
|
|
5958
|
+
}
|
|
5959
|
+
|
|
5960
|
+
static __global__ void im2col_f32_f16(
|
|
5961
|
+
const float * x, half * dst,
|
|
5962
|
+
int offset_delta, int IW, int IH, int OW, int KW, int KH, int pelements, int CHW,
|
|
5963
|
+
int s0, int s1, int p0, int p1, int d0, int d1) {
|
|
5964
|
+
const int i = threadIdx.x + blockIdx.x * blockDim.x;
|
|
5965
|
+
if (i >= pelements) {
|
|
5966
|
+
return;
|
|
5967
|
+
}
|
|
5968
|
+
|
|
5969
|
+
const int ksize = OW * (KH > 1 ? KW : 1);
|
|
5970
|
+
const int kx = i / ksize;
|
|
5971
|
+
const int kd = kx * ksize;
|
|
5972
|
+
const int ky = (i - kd) / OW;
|
|
5973
|
+
const int ix = i % OW;
|
|
5974
|
+
|
|
5975
|
+
const int64_t iiw = ix * s0 + kx * d0 - p0;
|
|
5976
|
+
const int64_t iih = blockIdx.y * s1 + ky * d1 - p1;
|
|
5977
|
+
|
|
5978
|
+
const int64_t offset_dst =
|
|
5979
|
+
(blockIdx.y * OW + ix) * CHW +
|
|
5980
|
+
(blockIdx.z * (KW * KH) + ky * KW + kx);
|
|
5981
|
+
|
|
5982
|
+
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
|
5983
|
+
dst[offset_dst] = __float2half(0.0f);
|
|
5984
|
+
} else {
|
|
5985
|
+
const int64_t offset_src = blockIdx.z * offset_delta;
|
|
5986
|
+
dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
|
|
5987
|
+
}
|
|
5988
|
+
}
|
|
5989
|
+
|
|
5990
|
+
template<int qk, int qr, dequantize_kernel_t dq>
|
|
5991
|
+
static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
|
5992
|
+
const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
|
|
5993
|
+
|
|
5994
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
|
5995
|
+
|
|
5996
|
+
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
|
|
5997
|
+
const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
|
|
5998
|
+
const dim3 block_nums(block_num_x, ne10, ne11*ne12);
|
|
5999
|
+
|
|
6000
|
+
// strides in elements
|
|
6001
|
+
//const size_t s0 = nb0 / ggml_element_size(dst);
|
|
6002
|
+
const size_t s1 = nb1 / ggml_element_size(dst);
|
|
6003
|
+
const size_t s2 = nb2 / ggml_element_size(dst);
|
|
5340
6004
|
const size_t s3 = nb3 / ggml_element_size(dst);
|
|
5341
6005
|
|
|
5342
6006
|
const size_t s10 = nb10 / ggml_element_size(src1);
|
|
@@ -5609,10 +6273,21 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
|
|
|
5609
6273
|
|
|
5610
6274
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
|
5611
6275
|
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
|
|
5612
|
-
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
|
6276
|
+
const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
|
|
5613
6277
|
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
|
5614
6278
|
}
|
|
5615
6279
|
|
|
6280
|
+
static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
|
|
6281
|
+
const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
|
|
6282
|
+
if (k % CUDA_Q8_0_NE_ALIGN == 0) {
|
|
6283
|
+
const bool need_check = false;
|
|
6284
|
+
dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
|
|
6285
|
+
} else {
|
|
6286
|
+
const bool need_check = true;
|
|
6287
|
+
dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
|
|
6288
|
+
}
|
|
6289
|
+
}
|
|
6290
|
+
|
|
5616
6291
|
template<typename dst_t>
|
|
5617
6292
|
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
|
5618
6293
|
const int nb = k / QK_K;
|
|
@@ -5633,6 +6308,20 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
|
|
|
5633
6308
|
#endif
|
|
5634
6309
|
}
|
|
5635
6310
|
|
|
6311
|
+
template<typename dst_t>
|
|
6312
|
+
static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
|
6313
|
+
const int nb32 = k / 32;
|
|
6314
|
+
const int nb = (k + 255) / 256;
|
|
6315
|
+
dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
|
|
6316
|
+
}
|
|
6317
|
+
|
|
6318
|
+
template<typename dst_t>
|
|
6319
|
+
static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
|
6320
|
+
const int nb32 = k / 32;
|
|
6321
|
+
const int nb = (k + 255) / 256;
|
|
6322
|
+
dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
|
|
6323
|
+
}
|
|
6324
|
+
|
|
5636
6325
|
template<typename dst_t>
|
|
5637
6326
|
static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
|
5638
6327
|
const int nb = k / QK_K;
|
|
@@ -5659,17 +6348,40 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
|
|
|
5659
6348
|
#endif
|
|
5660
6349
|
}
|
|
5661
6350
|
|
|
6351
|
+
template<typename dst_t>
|
|
6352
|
+
static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
|
6353
|
+
const int nb = k / QK_K;
|
|
6354
|
+
dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
|
|
6355
|
+
}
|
|
6356
|
+
|
|
6357
|
+
template<typename dst_t>
|
|
6358
|
+
static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
|
6359
|
+
const int nb = k / QK_K;
|
|
6360
|
+
dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
|
|
6361
|
+
}
|
|
6362
|
+
|
|
6363
|
+
template <typename src_t, typename dst_t>
|
|
6364
|
+
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
|
|
6365
|
+
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
|
6366
|
+
convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
|
6367
|
+
}
|
|
6368
|
+
|
|
5662
6369
|
static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
|
6370
|
+
int id;
|
|
5663
6371
|
switch (type) {
|
|
5664
6372
|
case GGML_TYPE_Q4_0:
|
|
5665
|
-
return
|
|
6373
|
+
return dequantize_row_q4_0_cuda;
|
|
5666
6374
|
case GGML_TYPE_Q4_1:
|
|
5667
|
-
return
|
|
6375
|
+
return dequantize_row_q4_1_cuda;
|
|
5668
6376
|
case GGML_TYPE_Q5_0:
|
|
5669
6377
|
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
|
5670
6378
|
case GGML_TYPE_Q5_1:
|
|
5671
6379
|
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
|
5672
6380
|
case GGML_TYPE_Q8_0:
|
|
6381
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
|
6382
|
+
if (g_device_caps[id].cc >= CC_PASCAL) {
|
|
6383
|
+
return dequantize_block_q8_0_f16_cuda;
|
|
6384
|
+
}
|
|
5673
6385
|
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
|
5674
6386
|
case GGML_TYPE_Q2_K:
|
|
5675
6387
|
return dequantize_row_q2_K_cuda;
|
|
@@ -5681,8 +6393,12 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
|
|
5681
6393
|
return dequantize_row_q5_K_cuda;
|
|
5682
6394
|
case GGML_TYPE_Q6_K:
|
|
5683
6395
|
return dequantize_row_q6_K_cuda;
|
|
6396
|
+
case GGML_TYPE_IQ2_XXS:
|
|
6397
|
+
return dequantize_row_iq2_xxs_cuda;
|
|
6398
|
+
case GGML_TYPE_IQ2_XS:
|
|
6399
|
+
return dequantize_row_iq2_xs_cuda;
|
|
5684
6400
|
case GGML_TYPE_F32:
|
|
5685
|
-
return
|
|
6401
|
+
return convert_unary_cuda<float>;
|
|
5686
6402
|
default:
|
|
5687
6403
|
return nullptr;
|
|
5688
6404
|
}
|
|
@@ -5691,9 +6407,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
|
|
5691
6407
|
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
|
5692
6408
|
switch (type) {
|
|
5693
6409
|
case GGML_TYPE_Q4_0:
|
|
5694
|
-
return
|
|
6410
|
+
return dequantize_row_q4_0_cuda;
|
|
5695
6411
|
case GGML_TYPE_Q4_1:
|
|
5696
|
-
return
|
|
6412
|
+
return dequantize_row_q4_1_cuda;
|
|
5697
6413
|
case GGML_TYPE_Q5_0:
|
|
5698
6414
|
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
|
5699
6415
|
case GGML_TYPE_Q5_1:
|
|
@@ -5710,8 +6426,12 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
|
|
5710
6426
|
return dequantize_row_q5_K_cuda;
|
|
5711
6427
|
case GGML_TYPE_Q6_K:
|
|
5712
6428
|
return dequantize_row_q6_K_cuda;
|
|
6429
|
+
case GGML_TYPE_IQ2_XXS:
|
|
6430
|
+
return dequantize_row_iq2_xxs_cuda;
|
|
6431
|
+
case GGML_TYPE_IQ2_XS:
|
|
6432
|
+
return dequantize_row_iq2_xs_cuda;
|
|
5713
6433
|
case GGML_TYPE_F16:
|
|
5714
|
-
return
|
|
6434
|
+
return convert_unary_cuda<half>;
|
|
5715
6435
|
default:
|
|
5716
6436
|
return nullptr;
|
|
5717
6437
|
}
|
|
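ggml_get_to_fp16_cuda and ggml_get_to_fp32_cuda above are plain type-to-launcher tables, with the Q8_0 entry now chosen per device. A toy sketch of that dispatch shape; the enum, function names, and the 600 threshold are placeholders for illustration, not the real ggml symbols:

    #include <cstdio>

    enum class qtype { q4_0, q8_0, f32 };                       // placeholder subset of ggml_type
    typedef void (*to_fp16_fn)(const void * src, void * dst, int k);

    static void deq_q4_0     (const void *, void *, int) { puts("q4_0 path"); }
    static void deq_q8_0     (const void *, void *, int) { puts("generic q8_0 path"); }
    static void deq_q8_0_fast(const void *, void *, int) { puts("f16-output q8_0 path"); }
    static void conv_unary   (const void *, void *, int) { puts("unary convert path"); }

    // Mostly a static switch, but the q8_0 entry depends on the device's
    // compute capability (600 is an assumed Pascal-class threshold).
    static to_fp16_fn get_to_fp16(qtype t, int compute_capability) {
        switch (t) {
            case qtype::q4_0: return deq_q4_0;
            case qtype::q8_0: return compute_capability >= 600 ? deq_q8_0_fast : deq_q8_0;
            case qtype::f32:  return conv_unary;
        }
        return nullptr;
    }

    int main() {
        get_to_fp16(qtype::q8_0, 610)(nullptr, nullptr, 0);     // fast path
        get_to_fp16(qtype::q8_0, 520)(nullptr, nullptr, 0);     // generic fallback
        return 0;
    }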
@@ -5904,6 +6624,24 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|
|
5904
6624
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
|
5905
6625
|
}
|
|
5906
6626
|
|
|
6627
|
+
static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
|
6628
|
+
GGML_ASSERT(ncols % QK_K == 0);
|
|
6629
|
+
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
|
6630
|
+
const dim3 block_nums(block_num_y, 1, 1);
|
|
6631
|
+
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
|
6632
|
+
mul_mat_vec_q<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
|
|
6633
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
|
6634
|
+
}
|
|
6635
|
+
|
|
6636
|
+
static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
|
6637
|
+
GGML_ASSERT(ncols % QK_K == 0);
|
|
6638
|
+
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
|
6639
|
+
const dim3 block_nums(block_num_y, 1, 1);
|
|
6640
|
+
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
|
6641
|
+
mul_mat_vec_q<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
|
|
6642
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
|
6643
|
+
}
|
|
6644
|
+
|
|
5907
6645
|
static void ggml_mul_mat_q4_0_q8_1_cuda(
|
|
5908
6646
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
|
5909
6647
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
|
@@ -6543,12 +7281,90 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
|
|
|
6543
7281
|
diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
|
|
6544
7282
|
}
|
|
6545
7283
|
|
|
7284
|
+
static void soft_max_f16_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
|
|
7285
|
+
int nth = WARP_SIZE;
|
|
7286
|
+
while (nth < ncols_x/2 && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
|
|
7287
|
+
const dim3 block_dims(nth, 1, 1);
|
|
7288
|
+
const dim3 block_nums(nrows_x, 1, 1);
|
|
7289
|
+
const size_t shmem = (GGML_PAD(ncols_x, 2*WARP_SIZE) + WARP_SIZE)*sizeof(half);
|
|
7290
|
+
static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
|
|
7291
|
+
if (shmem <= g_device_caps[g_main_device].smpb) {
|
|
7292
|
+
switch (ncols_x) {
|
|
7293
|
+
case 32:
|
|
7294
|
+
soft_max_f16<true, 32, 32, true><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7295
|
+
break;
|
|
7296
|
+
case 64:
|
|
7297
|
+
soft_max_f16<true, 64, 32, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7298
|
+
break;
|
|
7299
|
+
case 128:
|
|
7300
|
+
soft_max_f16<true, 128, 64, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7301
|
+
break;
|
|
7302
|
+
case 256:
|
|
7303
|
+
soft_max_f16<true, 256, 128, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7304
|
+
break;
|
|
7305
|
+
case 512:
|
|
7306
|
+
soft_max_f16<true, 512, 256, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7307
|
+
break;
|
|
7308
|
+
case 1024:
|
|
7309
|
+
soft_max_f16<true, 1024, 512, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7310
|
+
break;
|
|
7311
|
+
case 2048:
|
|
7312
|
+
soft_max_f16<true, 2048, 1024, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7313
|
+
break;
|
|
7314
|
+
case 4096:
|
|
7315
|
+
soft_max_f16<true, 4096, 1024, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7316
|
+
break;
|
|
7317
|
+
default:
|
|
7318
|
+
soft_max_f16<true, 0, 0, true><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7319
|
+
break;
|
|
7320
|
+
}
|
|
7321
|
+
} else {
|
|
7322
|
+
const size_t shmem_low = WARP_SIZE*sizeof(half);
|
|
7323
|
+
soft_max_f16<false, 0, 0, true><<<block_nums, block_dims, shmem_low, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7324
|
+
}
|
|
7325
|
+
}
|
|
7326
|
+
|
|
6546
7327
|
static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
|
|
6547
7328
|
int nth = WARP_SIZE;
|
|
6548
7329
|
while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
|
|
6549
7330
|
const dim3 block_dims(nth, 1, 1);
|
|
6550
7331
|
const dim3 block_nums(nrows_x, 1, 1);
|
|
6551
|
-
|
|
7332
|
+
const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
|
|
7333
|
+
static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
|
|
7334
|
+
if (shmem < g_device_caps[g_main_device].smpb) {
|
|
7335
|
+
switch (ncols_x) {
|
|
7336
|
+
case 32:
|
|
7337
|
+
soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7338
|
+
break;
|
|
7339
|
+
case 64:
|
|
7340
|
+
soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7341
|
+
break;
|
|
7342
|
+
case 128:
|
|
7343
|
+
soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7344
|
+
break;
|
|
7345
|
+
case 256:
|
|
7346
|
+
soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7347
|
+
break;
|
|
7348
|
+
case 512:
|
|
7349
|
+
soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7350
|
+
break;
|
|
7351
|
+
case 1024:
|
|
7352
|
+
soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7353
|
+
break;
|
|
7354
|
+
case 2048:
|
|
7355
|
+
soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7356
|
+
break;
|
|
7357
|
+
case 4096:
|
|
7358
|
+
soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7359
|
+
break;
|
|
7360
|
+
default:
|
|
7361
|
+
soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7362
|
+
break;
|
|
7363
|
+
}
|
|
7364
|
+
} else {
|
|
7365
|
+
const size_t shmem_low = WARP_SIZE*sizeof(float);
|
|
7366
|
+
soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
7367
|
+
}
|
|
6552
7368
|
}
|
|
6553
7369
|
|
|
6554
7370
|
static void im2col_f32_f16_cuda(const float* x, half* dst,
|
|
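The soft_max_f16_cuda/soft_max_f32_cuda launchers in the hunk above compute the dynamic shared-memory footprint as one padded row plus one slot per warp lane, and only take the template-specialized path when that fits the device's per-block limit (the smpb field recorded at init). A host-side sketch of the sizing rule for the f32 path, assuming a 48 KiB limit purely for illustration:

    #include <cstdio>
    #include <cstddef>
    #include <initializer_list>

    constexpr int WARP_SIZE = 32;

    // Same rounding as GGML_PAD in ggml.h: round x up to a multiple of n.
    static size_t pad_to(size_t x, size_t n) { return ((x + n - 1) / n) * n; }

    int main() {
        const size_t smpb = 48*1024;   // assumed sharedMemPerBlock; the real value is per device
        for (int ncols : {512, 4096, 16384}) {
            // padded row + one float per lane for the block-wide reduction
            const size_t shmem = (pad_to(ncols, WARP_SIZE) + WARP_SIZE)*sizeof(float);
            printf("ncols=%5d shmem=%6zu -> %s\n", ncols, shmem,
                   shmem < smpb ? "specialized soft_max_f32<true, ...>"
                                : "fallback soft_max_f32<false, 0, 0> with WARP_SIZE*sizeof(float)");
        }
        return 0;
    }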
@@ -6799,11 +7615,11 @@ struct cuda_pool_alloc {
|
|
|
6799
7615
|
|
|
6800
7616
|
static bool g_cublas_loaded = false;
|
|
6801
7617
|
|
|
6802
|
-
bool ggml_cublas_loaded(void) {
|
|
7618
|
+
GGML_CALL bool ggml_cublas_loaded(void) {
|
|
6803
7619
|
return g_cublas_loaded;
|
|
6804
7620
|
}
|
|
6805
7621
|
|
|
6806
|
-
void ggml_init_cublas() {
|
|
7622
|
+
GGML_CALL void ggml_init_cublas() {
|
|
6807
7623
|
static bool initialized = false;
|
|
6808
7624
|
|
|
6809
7625
|
if (!initialized) {
|
|
@@ -6856,16 +7672,18 @@ void ggml_init_cublas() {
|
|
|
6856
7672
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
|
6857
7673
|
fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
|
6858
7674
|
|
|
6859
|
-
|
|
7675
|
+
g_default_tensor_split[id] = total_vram;
|
|
6860
7676
|
total_vram += prop.totalGlobalMem;
|
|
7677
|
+
|
|
6861
7678
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
6862
7679
|
g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
|
|
6863
7680
|
#else
|
|
6864
7681
|
g_device_caps[id].cc = 100*prop.major + 10*prop.minor;
|
|
6865
7682
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
7683
|
+
g_device_caps[id].smpb = prop.sharedMemPerBlock;
|
|
6866
7684
|
}
|
|
6867
7685
|
for (int id = 0; id < g_device_count; ++id) {
|
|
6868
|
-
|
|
7686
|
+
g_default_tensor_split[id] /= total_vram;
|
|
6869
7687
|
}
|
|
6870
7688
|
|
|
6871
7689
|
for (int id = 0; id < g_device_count; ++id) {
|
|
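The initialization above builds g_default_tensor_split as normalized prefix sums of per-device VRAM: entry i holds the fraction of total memory sitting on devices before i, so device i owns the range [split[i], split[i+1]). A small sketch of that construction with made-up memory sizes:

    #include <cstdio>
    #include <vector>

    int main() {
        // hypothetical VRAM per device, in GiB
        const std::vector<double> vram = {24.0, 24.0, 12.0};

        std::vector<double> split(vram.size());
        double total = 0.0;
        for (size_t id = 0; id < vram.size(); ++id) {
            split[id] = total;       // memory that comes *before* this device
            total += vram[id];
        }
        for (double & s : split) {
            s /= total;              // normalize to [0, 1)
        }

        // device i is responsible for rows in [split[i], split[i+1]) of each split tensor
        for (size_t id = 0; id < split.size(); ++id) {
            const double hi = id + 1 < split.size() ? split[id + 1] : 1.0;
            printf("device %zu: [%.3f, %.3f)\n", id, split[id], hi);
        }
        return 0;
    }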
@@ -6889,31 +7707,7 @@ void ggml_init_cublas() {
|
|
|
6889
7707
|
}
|
|
6890
7708
|
}
|
|
6891
7709
|
|
|
6892
|
-
void
|
|
6893
|
-
if (tensor_split == nullptr) {
|
|
6894
|
-
return;
|
|
6895
|
-
}
|
|
6896
|
-
bool all_zero = true;
|
|
6897
|
-
for (int i = 0; i < g_device_count; ++i) {
|
|
6898
|
-
if (tensor_split[i] != 0.0f) {
|
|
6899
|
-
all_zero = false;
|
|
6900
|
-
break;
|
|
6901
|
-
}
|
|
6902
|
-
}
|
|
6903
|
-
if (all_zero) {
|
|
6904
|
-
return;
|
|
6905
|
-
}
|
|
6906
|
-
float split_sum = 0.0f;
|
|
6907
|
-
for (int i = 0; i < g_device_count; ++i) {
|
|
6908
|
-
g_tensor_split[i] = split_sum;
|
|
6909
|
-
split_sum += tensor_split[i];
|
|
6910
|
-
}
|
|
6911
|
-
for (int i = 0; i < g_device_count; ++i) {
|
|
6912
|
-
g_tensor_split[i] /= split_sum;
|
|
6913
|
-
}
|
|
6914
|
-
}
|
|
6915
|
-
|
|
6916
|
-
void * ggml_cuda_host_malloc(size_t size) {
|
|
7710
|
+
GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
|
|
6917
7711
|
if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
|
|
6918
7712
|
return nullptr;
|
|
6919
7713
|
}
|
|
@@ -6931,7 +7725,7 @@ void * ggml_cuda_host_malloc(size_t size) {
|
|
|
6931
7725
|
return ptr;
|
|
6932
7726
|
}
|
|
6933
7727
|
|
|
6934
|
-
void ggml_cuda_host_free(void * ptr) {
|
|
7728
|
+
GGML_CALL void ggml_cuda_host_free(void * ptr) {
|
|
6935
7729
|
CUDA_CHECK(cudaFreeHost(ptr));
|
|
6936
7730
|
}
|
|
6937
7731
|
|
|
@@ -7364,11 +8158,11 @@ static void ggml_cuda_op_mul_mat_q(
|
|
|
7364
8158
|
(void) src1_ddf_i;
|
|
7365
8159
|
}
|
|
7366
8160
|
|
|
7367
|
-
static int64_t get_row_rounding(ggml_type type) {
|
|
8161
|
+
static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
|
|
7368
8162
|
int64_t min_compute_capability = INT_MAX;
|
|
7369
8163
|
int64_t max_compute_capability = INT_MIN;
|
|
7370
8164
|
for (int id = 0; id < g_device_count; ++id) {
|
|
7371
|
-
if (
|
|
8165
|
+
if (tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
|
|
7372
8166
|
if (min_compute_capability > g_device_caps[id].cc) {
|
|
7373
8167
|
min_compute_capability = g_device_caps[id].cc;
|
|
7374
8168
|
}
|
|
@@ -7396,6 +8190,8 @@ static int64_t get_row_rounding(ggml_type type) {
|
|
|
7396
8190
|
case GGML_TYPE_Q4_K:
|
|
7397
8191
|
case GGML_TYPE_Q5_K:
|
|
7398
8192
|
case GGML_TYPE_Q6_K:
|
|
8193
|
+
case GGML_TYPE_IQ2_XXS:
|
|
8194
|
+
case GGML_TYPE_IQ2_XS:
|
|
7399
8195
|
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
|
7400
8196
|
default:
|
|
7401
8197
|
GGML_ASSERT(false);
|
|
@@ -7416,6 +8212,8 @@ static int64_t get_row_rounding(ggml_type type) {
|
|
|
7416
8212
|
case GGML_TYPE_Q3_K:
|
|
7417
8213
|
case GGML_TYPE_Q4_K:
|
|
7418
8214
|
case GGML_TYPE_Q5_K:
|
|
8215
|
+
case GGML_TYPE_IQ2_XXS:
|
|
8216
|
+
case GGML_TYPE_IQ2_XS:
|
|
7419
8217
|
return max_compute_capability >= CC_VOLTA ? 128 : 64;
|
|
7420
8218
|
case GGML_TYPE_Q6_K:
|
|
7421
8219
|
return 64;
|
|
@@ -7425,6 +8223,21 @@ static int64_t get_row_rounding(ggml_type type) {
|
|
|
7425
8223
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
7426
8224
|
}
|
|
7427
8225
|
|
|
8226
|
+
static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
|
|
8227
|
+
const int64_t nrows = ggml_nrows(tensor);
|
|
8228
|
+
const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
|
|
8229
|
+
|
|
8230
|
+
*row_low = id == 0 ? 0 : nrows*tensor_split[id];
|
|
8231
|
+
*row_low -= *row_low % rounding;
|
|
8232
|
+
|
|
8233
|
+
if (id == g_device_count - 1) {
|
|
8234
|
+
*row_high = nrows;
|
|
8235
|
+
} else {
|
|
8236
|
+
*row_high = nrows*tensor_split[id + 1];
|
|
8237
|
+
*row_high -= *row_high % rounding;
|
|
8238
|
+
}
|
|
8239
|
+
}
|
|
8240
|
+
|
|
7428
8241
|
static void ggml_cuda_op_mul_mat_vec_q(
|
|
7429
8242
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
|
7430
8243
|
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
|
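get_row_split above turns those cumulative fractions into a concrete [row_low, row_high) range per device, rounding both ends down to a type-dependent tile size so a mul_mat tile never straddles two devices. A minimal sketch, assuming a fixed rounding of 64 rows and two devices in place of get_row_rounding and g_device_count:

    #include <cstdio>
    #include <cstdint>
    #include <array>

    // Illustrative: maps a device's split fractions to a row range, mirroring the
    // rounding rules in get_row_split (rounding value assumed, 2 devices).
    static void row_split(int64_t nrows, const std::array<float, 2> & split, int id, int n_dev,
                          int64_t rounding, int64_t * row_low, int64_t * row_high) {
        *row_low  = id == 0 ? 0 : (int64_t)(nrows * split[id]);
        *row_low -= *row_low % rounding;
        if (id == n_dev - 1) {
            *row_high = nrows;                       // last device takes the remainder
        } else {
            *row_high  = (int64_t)(nrows * split[id + 1]);
            *row_high -= *row_high % rounding;
        }
    }

    int main() {
        const std::array<float, 2> split = {0.0f, 0.6f};   // 60/40 split, as cumulative fractions
        for (int id = 0; id < 2; ++id) {
            int64_t lo, hi;
            row_split(4096, split, id, 2, /*rounding =*/ 64, &lo, &hi);
            printf("device %d: rows [%lld, %lld)\n", id, (long long)lo, (long long)hi);
        }
        return 0;
    }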
@@ -7466,6 +8279,12 @@ static void ggml_cuda_op_mul_mat_vec_q(
|
|
|
7466
8279
|
case GGML_TYPE_Q6_K:
|
|
7467
8280
|
mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
|
7468
8281
|
break;
|
|
8282
|
+
case GGML_TYPE_IQ2_XXS:
|
|
8283
|
+
mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
|
8284
|
+
break;
|
|
8285
|
+
case GGML_TYPE_IQ2_XS:
|
|
8286
|
+
mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
|
8287
|
+
break;
|
|
7469
8288
|
default:
|
|
7470
8289
|
GGML_ASSERT(false);
|
|
7471
8290
|
break;
|
|
@@ -7873,7 +8692,21 @@ static void ggml_cuda_op_soft_max(
|
|
|
7873
8692
|
float scale = 1.0f;
|
|
7874
8693
|
memcpy(&scale, dst->op_params, sizeof(float));
|
|
7875
8694
|
|
|
7876
|
-
|
|
8695
|
+
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX
|
|
8696
|
+
#ifdef GGML_CUDA_F16
|
|
8697
|
+
const bool use_f16_soft_max = true;
|
|
8698
|
+
#else
|
|
8699
|
+
const bool use_f16_soft_max = false;
|
|
8700
|
+
#endif // GGML_CUDA_F16
|
|
8701
|
+
#else
|
|
8702
|
+
const bool use_f16_soft_max = false;
|
|
8703
|
+
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX
|
|
8704
|
+
|
|
8705
|
+
if (use_f16_soft_max) {
|
|
8706
|
+
soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
|
|
8707
|
+
} else {
|
|
8708
|
+
soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
|
|
8709
|
+
}
|
|
7877
8710
|
|
|
7878
8711
|
(void) dst;
|
|
7879
8712
|
}
|
|
@@ -8022,6 +8855,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
|
|
|
8022
8855
|
peer_access_enabled = enable_peer_access;
|
|
8023
8856
|
}
|
|
8024
8857
|
|
|
8858
|
+
// FIXME: move this somewhere else
|
|
8859
|
+
struct ggml_backend_cuda_split_buffer_type_context {
|
|
8860
|
+
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
|
|
8861
|
+
};
|
|
8862
|
+
|
|
8025
8863
|
static void ggml_cuda_op_mul_mat(
|
|
8026
8864
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
|
|
8027
8865
|
const bool convert_src1_to_q8_1) {
|
|
@@ -8073,6 +8911,14 @@ static void ggml_cuda_op_mul_mat(
|
|
|
8073
8911
|
GGML_ASSERT(!(split && ne03 > 1));
|
|
8074
8912
|
GGML_ASSERT(!(split && ne02 < ne12));
|
|
8075
8913
|
|
|
8914
|
+
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
|
|
8915
|
+
if (split) {
|
|
8916
|
+
// TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check
|
|
8917
|
+
// GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
|
|
8918
|
+
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
|
8919
|
+
tensor_split = buft_ctx->tensor_split;
|
|
8920
|
+
}
|
|
8921
|
+
|
|
8076
8922
|
struct dev_data {
|
|
8077
8923
|
cuda_pool_alloc<char> src0_dd_alloc;
|
|
8078
8924
|
cuda_pool_alloc<float> src1_ddf_alloc;
|
|
@@ -8100,17 +8946,17 @@ static void ggml_cuda_op_mul_mat(
|
|
|
8100
8946
|
// for multi GPU, get the row boundaries from tensor split
|
|
8101
8947
|
// and round to mul_mat_q tile sizes
|
|
8102
8948
|
if (split) {
|
|
8103
|
-
const int64_t rounding = get_row_rounding(src0->type);
|
|
8949
|
+
const int64_t rounding = get_row_rounding(src0->type, tensor_split);
|
|
8104
8950
|
|
|
8105
8951
|
if (id != 0) {
|
|
8106
|
-
dev[id].row_low = ne01*
|
|
8952
|
+
dev[id].row_low = ne01*tensor_split[id];
|
|
8107
8953
|
if (dev[id].row_low < ne01) {
|
|
8108
8954
|
dev[id].row_low -= dev[id].row_low % rounding;
|
|
8109
8955
|
}
|
|
8110
8956
|
}
|
|
8111
8957
|
|
|
8112
8958
|
if (id != g_device_count - 1) {
|
|
8113
|
-
dev[id].row_high = ne01*
|
|
8959
|
+
dev[id].row_high = ne01*tensor_split[id + 1];
|
|
8114
8960
|
if (dev[id].row_high < ne01) {
|
|
8115
8961
|
dev[id].row_high -= dev[id].row_high % rounding;
|
|
8116
8962
|
}
|
|
@@ -8396,7 +9242,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
|
|
|
8396
9242
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
|
|
8397
9243
|
}
|
|
8398
9244
|
|
|
8399
|
-
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
|
9245
|
+
GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
|
8400
9246
|
if (!g_cublas_loaded) return false;
|
|
8401
9247
|
|
|
8402
9248
|
const int64_t ne10 = src1->ne[0];
|
|
@@ -8656,10 +9502,17 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
|
8656
9502
|
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
|
8657
9503
|
|
|
8658
9504
|
int64_t min_compute_capability = INT_MAX;
|
|
8659
|
-
|
|
8660
|
-
|
|
8661
|
-
|
|
9505
|
+
|
|
9506
|
+
if (split) {
|
|
9507
|
+
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
|
9508
|
+
auto & tensor_split = buft_ctx->tensor_split;
|
|
9509
|
+
for (int id = 0; id < g_device_count; ++id) {
|
|
9510
|
+
if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
|
|
9511
|
+
min_compute_capability = g_device_caps[id].cc;
|
|
9512
|
+
}
|
|
8662
9513
|
}
|
|
9514
|
+
} else {
|
|
9515
|
+
min_compute_capability = g_device_caps[g_main_device].cc;
|
|
8663
9516
|
}
|
|
8664
9517
|
|
|
8665
9518
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
@@ -8682,6 +9535,8 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
|
8682
9535
|
|
|
8683
9536
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
8684
9537
|
|
|
9538
|
+
use_mul_mat_q = use_mul_mat_q && ggml_cuda_supports_mmq(src0->type);
|
|
9539
|
+
|
|
8685
9540
|
// debug helpers
|
|
8686
9541
|
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
|
|
8687
9542
|
//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
|
|
@@ -8696,7 +9551,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
|
8696
9551
|
} else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
|
8697
9552
|
// KQV single-batch
|
|
8698
9553
|
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
|
8699
|
-
} else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
|
9554
|
+
} else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
|
8700
9555
|
// KQ + KQV multi-batch
|
|
8701
9556
|
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
|
8702
9557
|
} else if (src0->type == GGML_TYPE_F32) {
|
|
@@ -9158,299 +10013,41 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
|
|
|
9158
10013
|
return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
|
|
9159
10014
|
}
|
|
9160
10015
|
|
|
9161
|
-
void
|
|
9162
|
-
|
|
10016
|
+
GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
|
|
10017
|
+
if (main_device >= g_device_count) {
|
|
10018
|
+
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
|
|
10019
|
+
main_device, g_device_count, g_main_device);
|
|
10020
|
+
return;
|
|
10021
|
+
}
|
|
9163
10022
|
|
|
9164
|
-
|
|
10023
|
+
if (g_main_device != main_device && g_device_count > 1) {
|
|
10024
|
+
g_main_device = main_device;
|
|
10025
|
+
//cudaDeviceProp prop;
|
|
10026
|
+
//CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
|
|
10027
|
+
//fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
|
|
10028
|
+
}
|
|
10029
|
+
}
|
|
9165
10030
|
|
|
9166
|
-
|
|
10031
|
+
GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
|
10032
|
+
if (!g_cublas_loaded) return false;
|
|
10033
|
+
|
|
10034
|
+
ggml_cuda_func_t func;
|
|
10035
|
+
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
|
10036
|
+
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
|
10037
|
+
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|
|
9167
10038
|
|
|
9168
|
-
|
|
9169
|
-
|
|
9170
|
-
|
|
10039
|
+
if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
|
|
10040
|
+
return false;
|
|
10041
|
+
}
|
|
9171
10042
|
|
|
9172
|
-
|
|
9173
|
-
if (
|
|
9174
|
-
|
|
10043
|
+
if (tensor->op == GGML_OP_MUL_MAT) {
|
|
10044
|
+
if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
|
|
10045
|
+
#ifndef NDEBUG
|
|
10046
|
+
fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
|
|
10047
|
+
#endif
|
|
10048
|
+
return false;
|
|
9175
10049
|
}
|
|
9176
|
-
|
|
9177
|
-
ggml_cuda_set_device(id);
|
|
9178
|
-
|
|
9179
|
-
int64_t row_low, row_high;
|
|
9180
|
-
if (backend == GGML_BACKEND_GPU) {
|
|
9181
|
-
row_low = 0;
|
|
9182
|
-
row_high = nrows;
|
|
9183
|
-
} else if (backend == GGML_BACKEND_GPU_SPLIT) {
|
|
9184
|
-
const int64_t rounding = get_row_rounding(tensor->type);
|
|
9185
|
-
|
|
9186
|
-
row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
|
|
9187
|
-
row_low -= row_low % rounding;
|
|
9188
|
-
|
|
9189
|
-
if (id == g_device_count - 1) {
|
|
9190
|
-
row_high = nrows;
|
|
9191
|
-
} else {
|
|
9192
|
-
row_high = nrows*g_tensor_split[id + 1];
|
|
9193
|
-
row_high -= row_high % rounding;
|
|
9194
|
-
}
|
|
9195
|
-
} else {
|
|
9196
|
-
GGML_ASSERT(false);
|
|
9197
|
-
}
|
|
9198
|
-
if (row_low == row_high) {
|
|
9199
|
-
continue;
|
|
9200
|
-
}
|
|
9201
|
-
|
|
9202
|
-
int64_t nrows_split = row_high - row_low;
|
|
9203
|
-
|
|
9204
|
-
const size_t offset_split = row_low*nb1;
|
|
9205
|
-
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
|
9206
|
-
const size_t original_size = size;
|
|
9207
|
-
|
|
9208
|
-
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
|
9209
|
-
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
|
9210
|
-
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
9211
|
-
}
|
|
9212
|
-
|
|
9213
|
-
char * buf;
|
|
9214
|
-
CUDA_CHECK(cudaMalloc(&buf, size));
|
|
9215
|
-
char * buf_host = (char *)data + offset_split;
|
|
9216
|
-
|
|
9217
|
-
// set padding to 0 to avoid possible NaN values
|
|
9218
|
-
if (size > original_size) {
|
|
9219
|
-
CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
|
|
9220
|
-
}
|
|
9221
|
-
|
|
9222
|
-
CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
|
|
9223
|
-
|
|
9224
|
-
extra->data_device[id] = buf;
|
|
9225
|
-
|
|
9226
|
-
if (backend == GGML_BACKEND_GPU_SPLIT) {
|
|
9227
|
-
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
|
9228
|
-
CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
|
|
9229
|
-
}
|
|
9230
|
-
}
|
|
9231
|
-
}
|
|
9232
|
-
|
|
9233
|
-
tensor->extra = extra;
|
|
9234
|
-
}
|
|
9235
|
-
|
|
9236
|
-
void ggml_cuda_free_data(struct ggml_tensor * tensor) {
|
|
9237
|
-
if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
|
|
9238
|
-
return;
|
|
9239
|
-
}
|
|
9240
|
-
|
|
9241
|
-
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
|
9242
|
-
|
|
9243
|
-
for (int id = 0; id < g_device_count; ++id) {
|
|
9244
|
-
ggml_cuda_set_device(id);
|
|
9245
|
-
if (extra->data_device[id] != nullptr) {
|
|
9246
|
-
CUDA_CHECK(cudaFree(extra->data_device[id]));
|
|
9247
|
-
}
|
|
9248
|
-
|
|
9249
|
-
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
|
9250
|
-
if (extra->events[id][is] != nullptr) {
|
|
9251
|
-
CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
|
|
9252
|
-
}
|
|
9253
|
-
}
|
|
9254
|
-
}
|
|
9255
|
-
|
|
9256
|
-
delete extra;
|
|
9257
|
-
}
|
|
9258
|
-
|
|
9259
|
-
static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
|
|
9260
|
-
static size_t g_temp_tensor_extra_index = 0;
|
|
9261
|
-
|
|
9262
|
-
static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
|
9263
|
-
if (g_temp_tensor_extras == nullptr) {
|
|
9264
|
-
g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
|
|
9265
|
-
}
|
|
9266
|
-
|
|
9267
|
-
size_t alloc_index = g_temp_tensor_extra_index;
|
|
9268
|
-
g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
|
|
9269
|
-
ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
|
|
9270
|
-
memset(extra, 0, sizeof(*extra));
|
|
9271
|
-
|
|
9272
|
-
return extra;
|
|
9273
|
-
}
|
|
9274
|
-
|
|
9275
|
-
static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
|
|
9276
|
-
if (scratch && g_scratch_size == 0) {
|
|
9277
|
-
return;
|
|
9278
|
-
}
|
|
9279
|
-
|
|
9280
|
-
tensor->backend = GGML_BACKEND_GPU;
|
|
9281
|
-
|
|
9282
|
-
// recursively assign CUDA buffers until a compute tensor is found
|
|
9283
|
-
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
|
|
9284
|
-
const ggml_op src0_op = tensor->src[0]->op;
|
|
9285
|
-
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
|
|
9286
|
-
ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
|
|
9287
|
-
}
|
|
9288
|
-
}
|
|
9289
|
-
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
|
|
9290
|
-
ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
|
|
9291
|
-
}
|
|
9292
|
-
|
|
9293
|
-
if (scratch && no_alloc) {
|
|
9294
|
-
return;
|
|
9295
|
-
}
|
|
9296
|
-
|
|
9297
|
-
ggml_tensor_extra_gpu * extra;
|
|
9298
|
-
|
|
9299
|
-
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
|
9300
|
-
tensor->op == GGML_OP_VIEW ||
|
|
9301
|
-
force_inplace;
|
|
9302
|
-
const size_t size = ggml_nbytes(tensor);
|
|
9303
|
-
|
|
9304
|
-
ggml_cuda_set_device(g_main_device);
|
|
9305
|
-
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
|
9306
|
-
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
|
9307
|
-
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
|
9308
|
-
size_t offset = 0;
|
|
9309
|
-
if (tensor->op == GGML_OP_VIEW) {
|
|
9310
|
-
memcpy(&offset, tensor->op_params, sizeof(size_t));
|
|
9311
|
-
}
|
|
9312
|
-
extra = ggml_cuda_alloc_temp_tensor_extra();
|
|
9313
|
-
extra->data_device[g_main_device] = src0_ddc + offset;
|
|
9314
|
-
} else if (tensor->op == GGML_OP_CPY) {
|
|
9315
|
-
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
|
|
9316
|
-
void * src1_ddv = src1_extra->data_device[g_main_device];
|
|
9317
|
-
extra = ggml_cuda_alloc_temp_tensor_extra();
|
|
9318
|
-
extra->data_device[g_main_device] = src1_ddv;
|
|
9319
|
-
} else if (scratch) {
|
|
9320
|
-
GGML_ASSERT(size <= g_scratch_size);
|
|
9321
|
-
if (g_scratch_offset + size > g_scratch_size) {
|
|
9322
|
-
g_scratch_offset = 0;
|
|
9323
|
-
}
|
|
9324
|
-
|
|
9325
|
-
char * data = (char *) g_scratch_buffer;
|
|
9326
|
-
if (data == nullptr) {
|
|
9327
|
-
CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
|
|
9328
|
-
g_scratch_buffer = data;
|
|
9329
|
-
}
|
|
9330
|
-
extra = ggml_cuda_alloc_temp_tensor_extra();
|
|
9331
|
-
extra->data_device[g_main_device] = data + g_scratch_offset;
|
|
9332
|
-
|
|
9333
|
-
g_scratch_offset += size;
|
|
9334
|
-
|
|
9335
|
-
GGML_ASSERT(g_scratch_offset <= g_scratch_size);
|
|
9336
|
-
} else { // allocate new buffers outside of scratch
|
|
9337
|
-
void * data;
|
|
9338
|
-
CUDA_CHECK(cudaMalloc(&data, size));
|
|
9339
|
-
CUDA_CHECK(cudaMemset(data, 0, size));
|
|
9340
|
-
extra = new ggml_tensor_extra_gpu;
|
|
9341
|
-
memset(extra, 0, sizeof(*extra));
|
|
9342
|
-
extra->data_device[g_main_device] = data;
|
|
9343
|
-
}
|
|
9344
|
-
|
|
9345
|
-
tensor->extra = extra;
|
|
9346
|
-
}
|
|
9347
|
-
|
|
9348
|
-
void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
|
|
9349
|
-
if (g_scratch_size == 0) {
|
|
9350
|
-
return;
|
|
9351
|
-
}
|
|
9352
|
-
if (g_scratch_buffer == nullptr) {
|
|
9353
|
-
ggml_cuda_set_device(g_main_device);
|
|
9354
|
-
CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
|
|
9355
|
-
}
|
|
9356
|
-
|
|
9357
|
-
ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
|
|
9358
|
-
|
|
9359
|
-
const bool inplace = tensor->view_src != nullptr;
|
|
9360
|
-
|
|
9361
|
-
if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
|
|
9362
|
-
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
|
|
9363
|
-
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
|
9364
|
-
size_t view_offset = 0;
|
|
9365
|
-
if (tensor->op == GGML_OP_VIEW) {
|
|
9366
|
-
memcpy(&view_offset, tensor->op_params, sizeof(size_t));
|
|
9367
|
-
}
|
|
9368
|
-
extra->data_device[g_main_device] = src0_ddc + view_offset;
|
|
9369
|
-
} else {
|
|
9370
|
-
extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
|
|
9371
|
-
}
|
|
9372
|
-
|
|
9373
|
-
tensor->extra = extra;
|
|
9374
|
-
}
|
|
9375
|
-
|
|
9376
|
-
void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
|
|
9377
|
-
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
|
9378
|
-
GGML_ASSERT(ggml_is_contiguous(tensor));
|
|
9379
|
-
|
|
9380
|
-
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
|
9381
|
-
ggml_cuda_set_device(g_main_device);
|
|
9382
|
-
CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
|
|
9383
|
-
}
|
|
9384
|
-
|
|
9385
|
-
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
|
|
9386
|
-
ggml_cuda_assign_buffers_impl(tensor, true, false, false);
|
|
9387
|
-
}
|
|
9388
|
-
|
|
9389
|
-
void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
|
|
9390
|
-
ggml_cuda_assign_buffers_impl(tensor, true, false, true);
|
|
9391
|
-
}
|
|
9392
|
-
|
|
9393
|
-
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
|
|
9394
|
-
ggml_cuda_assign_buffers_impl(tensor, false, false, false);
|
|
9395
|
-
}
|
|
9396
|
-
|
|
9397
|
-
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
|
|
9398
|
-
ggml_cuda_assign_buffers_impl(tensor, false, true, false);
|
|
9399
|
-
}
|
|
9400
|
-
|
|
9401
|
-
void ggml_cuda_set_main_device(const int main_device) {
|
|
9402
|
-
if (main_device >= g_device_count) {
|
|
9403
|
-
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
|
|
9404
|
-
main_device, g_device_count, g_main_device);
|
|
9405
|
-
return;
|
|
9406
|
-
}
|
|
9407
|
-
|
|
9408
|
-
if (g_main_device != main_device && g_device_count > 1) {
|
|
9409
|
-
g_main_device = main_device;
|
|
9410
|
-
cudaDeviceProp prop;
|
|
9411
|
-
CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
|
|
9412
|
-
fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
|
|
9413
|
-
}
|
|
9414
|
-
}
|
|
9415
|
-
|
|
9416
|
-
void ggml_cuda_set_scratch_size(const size_t scratch_size) {
|
|
9417
|
-
// this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
|
|
9418
|
-
// it still won't always work as expected, but it's better than nothing
|
|
9419
|
-
if (scratch_size > g_scratch_size) {
|
|
9420
|
-
ggml_cuda_free_scratch();
|
|
9421
|
-
}
|
|
9422
|
-
g_scratch_size = std::max(g_scratch_size, scratch_size);
|
|
9423
|
-
}
|
|
9424
|
-
|
|
9425
|
-
void ggml_cuda_free_scratch() {
|
|
9426
|
-
if (g_scratch_buffer == nullptr) {
|
|
9427
|
-
return;
|
|
9428
|
-
}
|
|
9429
|
-
|
|
9430
|
-
CUDA_CHECK(cudaFree(g_scratch_buffer));
|
|
9431
|
-
g_scratch_buffer = nullptr;
|
|
9432
|
-
}
|
|
9433
|
-
|
|
9434
|
-
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
|
9435
|
-
if (!g_cublas_loaded) return false;
|
|
9436
|
-
|
|
9437
|
-
ggml_cuda_func_t func;
|
|
9438
|
-
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
|
9439
|
-
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
|
9440
|
-
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|
|
9441
|
-
|
|
9442
|
-
if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
|
|
9443
|
-
return false;
|
|
9444
|
-
}
|
|
9445
|
-
|
|
9446
|
-
if (tensor->op == GGML_OP_MUL_MAT) {
|
|
9447
|
-
if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
|
|
9448
|
-
#ifndef NDEBUG
|
|
9449
|
-
fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
|
|
9450
|
-
#endif
|
|
9451
|
-
return false;
|
|
9452
|
-
}
|
|
9453
|
-
}
|
|
10050
|
+
}
|
|
9454
10051
|
|
|
9455
10052
|
switch (tensor->op) {
|
|
9456
10053
|
case GGML_OP_REPEAT:
|
|
@@ -9589,7 +10186,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
|
9589
10186
|
return true;
|
|
9590
10187
|
}
|
|
9591
10188
|
|
|
9592
|
-
int ggml_cuda_get_device_count() {
|
|
10189
|
+
GGML_CALL int ggml_cuda_get_device_count() {
|
|
9593
10190
|
int device_count;
|
|
9594
10191
|
if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
|
|
9595
10192
|
return 0;
|
|
@@ -9597,7 +10194,7 @@ int ggml_cuda_get_device_count() {
|
|
|
9597
10194
|
return device_count;
|
|
9598
10195
|
}
|
|
9599
10196
|
|
|
9600
|
-
void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
|
|
10197
|
+
GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
|
|
9601
10198
|
cudaDeviceProp prop;
|
|
9602
10199
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
|
|
9603
10200
|
snprintf(description, description_size, "%s", prop.name);
|
|
@@ -9609,21 +10206,31 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
|
|
|
9609
10206
|
|
|
9610
10207
|
#define UNUSED GGML_UNUSED
|
|
9611
10208
|
|
|
10209
|
+
struct ggml_backend_cuda_context {
|
|
10210
|
+
int device;
|
|
10211
|
+
std::string name;
|
|
10212
|
+
};
|
|
10213
|
+
|
|
9612
10214
|
// cuda buffer
|
|
9613
10215
|
|
|
9614
|
-
struct
|
|
10216
|
+
struct ggml_backend_cuda_buffer_context {
|
|
9615
10217
|
int device;
|
|
9616
10218
|
void * dev_ptr = nullptr;
|
|
9617
10219
|
ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
|
|
9618
10220
|
size_t temp_tensor_extra_index = 0;
|
|
10221
|
+
std::string name;
|
|
9619
10222
|
|
|
9620
|
-
|
|
10223
|
+
ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
|
|
10224
|
+
device(device), dev_ptr(dev_ptr),
|
|
10225
|
+
name(GGML_CUDA_NAME + std::to_string(device)) {
|
|
10226
|
+
}
|
|
9621
10227
|
|
|
9622
|
-
~
|
|
10228
|
+
~ggml_backend_cuda_buffer_context() {
|
|
9623
10229
|
delete[] temp_tensor_extras;
|
|
9624
10230
|
}
|
|
9625
10231
|
|
|
9626
10232
|
ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
|
10233
|
+
// TODO: remove GGML_CUDA_MAX_NODES, allocate dynamically and reuse in backend_buffer_reset
|
|
9627
10234
|
if (temp_tensor_extras == nullptr) {
|
|
9628
10235
|
temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
|
|
9629
10236
|
}
|
|
@@ -9637,19 +10244,28 @@ struct ggml_backend_buffer_context_cuda {
|
|
|
9637
10244
|
}
|
|
9638
10245
|
};
|
|
9639
10246
|
|
|
9640
|
-
static
|
|
9641
|
-
|
|
10247
|
+
GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
10248
|
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
10249
|
+
return ctx->name.c_str();
|
|
10250
|
+
}
|
|
10251
|
+
|
|
10252
|
+
GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
|
10253
|
+
return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
|
|
10254
|
+
}
|
|
10255
|
+
|
|
10256
|
+
GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
10257
|
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
9642
10258
|
CUDA_CHECK(cudaFree(ctx->dev_ptr));
|
|
9643
10259
|
delete ctx;
|
|
9644
10260
|
}
|
|
9645
10261
|
|
|
9646
|
-
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
9647
|
-
|
|
10262
|
+
GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
10263
|
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
9648
10264
|
return ctx->dev_ptr;
|
|
9649
10265
|
}
|
|
9650
10266
|
|
|
9651
|
-
static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
|
9652
|
-
|
|
10267
|
+
GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
|
10268
|
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
9653
10269
|
|
|
9654
10270
|
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
|
9655
10271
|
assert(tensor->view_src->buffer->buft == buffer->buft);
|
|
@@ -9678,76 +10294,106 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
|
|
|
9678
10294
|
CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
|
|
9679
10295
|
}
|
|
9680
10296
|
}
|
|
9681
|
-
|
|
9682
|
-
UNUSED(buffer);
|
|
9683
10297
|
}
|
|
9684
10298
|
|
|
9685
|
-
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
10299
|
+
GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
9686
10300
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
|
9687
10301
|
|
|
9688
|
-
|
|
10302
|
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
9689
10303
|
|
|
9690
10304
|
ggml_cuda_set_device(ctx->device);
|
|
9691
10305
|
CUDA_CHECK(cudaDeviceSynchronize());
|
|
9692
|
-
|
|
9693
10306
|
CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
|
|
10307
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
|
9694
10308
|
}
|
|
9695
10309
|
|
|
9696
|
-
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
10310
|
+
GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
9697
10311
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
|
9698
10312
|
|
|
9699
|
-
|
|
10313
|
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
9700
10314
|
|
|
9701
10315
|
ggml_cuda_set_device(ctx->device);
|
|
9702
10316
|
CUDA_CHECK(cudaDeviceSynchronize());
|
|
9703
|
-
|
|
9704
10317
|
CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
|
|
10318
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
|
9705
10319
|
}
|
|
9706
10320
|
|
|
9707
|
-
static
|
|
9708
|
-
|
|
10321
|
+
GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
|
10322
|
+
if (ggml_backend_buffer_is_cuda(src->buffer)) {
|
|
10323
|
+
ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
|
|
10324
|
+
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
10325
|
+
|
|
10326
|
+
ggml_cuda_set_device(src_ctx->device);
|
|
10327
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
|
10328
|
+
ggml_cuda_set_device(dst_ctx->device);
|
|
10329
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
|
10330
|
+
CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
|
|
10331
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
|
10332
|
+
|
|
10333
|
+
return true;
|
|
10334
|
+
}
|
|
10335
|
+
return false;
|
|
10336
|
+
}
|
|
10337
|
+
|
|
10338
|
+
GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
10339
|
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
9709
10340
|
|
|
9710
10341
|
ggml_cuda_set_device(ctx->device);
|
|
9711
10342
|
CUDA_CHECK(cudaDeviceSynchronize());
|
|
9712
|
-
|
|
9713
10343
|
CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
|
|
10344
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
|
9714
10345
|
}
|
|
9715
10346
|
|
|
9716
|
-
static
|
|
10347
|
+
static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
|
|
10348
|
+
/* .get_name = */ ggml_backend_cuda_buffer_get_name,
|
|
9717
10349
|
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
|
|
9718
10350
|
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
|
|
9719
10351
|
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
|
|
9720
10352
|
/* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
|
|
9721
10353
|
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
|
|
9722
|
-
/* .
|
|
9723
|
-
/* .cpy_tensor_to = */ NULL,
|
|
10354
|
+
/* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
|
|
9724
10355
|
/* .clear = */ ggml_backend_cuda_buffer_clear,
|
|
10356
|
+
/* .reset = */ NULL,
|
|
9725
10357
|
};
|
|
9726
10358
|
|
|
9727
10359
|
// cuda buffer type
|
|
10360
|
+
struct ggml_backend_cuda_buffer_type_context {
|
|
10361
|
+
int device;
|
|
10362
|
+
std::string name;
|
|
10363
|
+
};
|
|
9728
10364
|
|
|
9729
|
-
static
|
|
9730
|
-
|
|
10365
|
+
GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
|
10366
|
+
ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
|
9731
10367
|
|
|
9732
|
-
|
|
10368
|
+
return ctx->name.c_str();
|
|
10369
|
+
}
|
|
10370
|
+
|
|
10371
|
+
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
10372
|
+
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
|
10373
|
+
|
|
10374
|
+
ggml_cuda_set_device(buft_ctx->device);
|
|
9733
10375
|
|
|
9734
10376
|
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
|
9735
10377
|
|
|
9736
10378
|
void * dev_ptr;
|
|
9737
|
-
|
|
10379
|
+
cudaError_t err = cudaMalloc(&dev_ptr, size);
|
|
10380
|
+
if (err != cudaSuccess) {
|
|
10381
|
+
fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
|
|
10382
|
+
return nullptr;
|
|
10383
|
+
}
|
|
9738
10384
|
|
|
9739
|
-
|
|
10385
|
+
ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
|
|
9740
10386
|
|
|
9741
|
-
return ggml_backend_buffer_init(buft,
|
|
10387
|
+
return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
|
|
9742
10388
|
}
|
|
9743
10389
|
|
|
9744
|
-
static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
10390
|
+
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
9745
10391
|
return 128;
|
|
9746
10392
|
|
|
9747
10393
|
UNUSED(buft);
|
|
9748
10394
|
}
|
|
9749
10395
|
|
|
9750
|
-
static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
|
|
10396
|
+
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
|
9751
10397
|
int64_t row_low = 0;
|
|
9752
10398
|
int64_t row_high = ggml_nrows(tensor);
|
|
9753
10399
|
int64_t nrows_split = row_high - row_low;
|
|
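The buffer above is wired up through ggml_backend_buffer_i, a struct of function pointers, and ggml_backend_buffer_is_cuda identifies a CUDA buffer by comparing its get_name pointer instead of storing a type tag. A simplified sketch of that idiom with stand-in types (not the real ggml-backend structs):

    #include <cstdio>

    // Simplified stand-ins for ggml_backend_buffer_i / ggml_backend_buffer_t.
    struct buffer_iface {
        const char * (*get_name)(void * ctx);
    };
    struct buffer {
        buffer_iface iface;
        void * context;
    };

    static const char * cuda_buffer_get_name(void *) { return "CUDA0"; }
    static const char * host_buffer_get_name(void *) { return "CPU"; }

    // Same idiom as ggml_backend_buffer_is_cuda: compare the function pointer,
    // so no extra type field has to live in the buffer struct.
    static bool buffer_is_cuda(const buffer & b) {
        return b.iface.get_name == cuda_buffer_get_name;
    }

    int main() {
        buffer a{{cuda_buffer_get_name}, nullptr};
        buffer b{{host_buffer_get_name}, nullptr};
        printf("%d %d\n", buffer_is_cuda(a), buffer_is_cuda(b));  // prints: 1 0
        return 0;
    }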
@@ -9767,22 +10413,33 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
|
|
|
9767
10413
|
UNUSED(buft);
|
|
9768
10414
|
}
|
|
9769
10415
|
|
|
9770
|
-
static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
9771
|
-
|
|
10416
|
+
GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
10417
|
+
if (!ggml_backend_is_cuda(backend)) {
|
|
10418
|
+
return false;
|
|
10419
|
+
}
|
|
9772
10420
|
|
|
9773
|
-
|
|
10421
|
+
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
|
10422
|
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
10423
|
+
|
|
10424
|
+
return buft_ctx->device == cuda_ctx->device;
|
|
9774
10425
|
}
|
|
9775
10426
|
|
|
9776
10427
|
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
|
10428
|
+
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
|
|
9777
10429
|
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
|
9778
10430
|
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
|
9779
10431
|
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
|
9780
10432
|
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
|
9781
|
-
/* .is_host = */
|
|
10433
|
+
/* .is_host = */ NULL,
|
|
9782
10434
|
};
|
|
9783
10435
|
|
|
9784
|
-
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|
9785
|
-
|
|
10436
|
+
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|
10437
|
+
// FIXME: this is not thread safe
|
|
10438
|
+
if (device >= ggml_backend_cuda_get_device_count()) {
|
|
10439
|
+
return nullptr;
|
|
10440
|
+
}
|
|
10441
|
+
|
|
10442
|
+
static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
|
|
9786
10443
|
|
|
9787
10444
|
static bool ggml_backend_cuda_buffer_type_initialized = false;
|
|
9788
10445
|
|
|
@@ -9790,7 +10447,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|
|
9790
10447
|
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
|
|
9791
10448
|
ggml_backend_cuda_buffer_types[i] = {
|
|
9792
10449
|
/* .iface = */ ggml_backend_cuda_buffer_type_interface,
|
|
9793
|
-
/* .context = */
|
|
10450
|
+
/* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
|
|
9794
10451
|
};
|
|
9795
10452
|
}
|
|
9796
10453
|
ggml_backend_cuda_buffer_type_initialized = true;
|
|
@@ -9799,13 +10456,311 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|
|
9799
10456
|
return &ggml_backend_cuda_buffer_types[device];
|
|
9800
10457
|
}
|
|
9801
10458
|
|
|
10459
|
+
// cuda split buffer
|
|
10460
|
+
|
|
10461
|
+
struct ggml_backend_cuda_split_buffer_context {
|
|
10462
|
+
~ggml_backend_cuda_split_buffer_context() {
|
|
10463
|
+
for (ggml_tensor_extra_gpu * extra : tensor_extras) {
|
|
10464
|
+
for (int id = 0; id < g_device_count; ++id) {
|
|
10465
|
+
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
|
10466
|
+
if (extra->events[id][is] != nullptr) {
|
|
10467
|
+
CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
|
|
10468
|
+
}
|
|
10469
|
+
}
|
|
10470
|
+
if (extra->data_device[id] != nullptr) {
|
|
10471
|
+
CUDA_CHECK(cudaFree(extra->data_device[id]));
|
|
10472
|
+
}
|
|
10473
|
+
}
|
|
10474
|
+
delete extra;
|
|
10475
|
+
}
|
|
10476
|
+
}
|
|
10477
|
+
|
|
10478
|
+
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
|
|
10479
|
+
};
|
|
10480
|
+
|
|
10481
|
+
GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
10482
|
+
return GGML_CUDA_NAME "_Split";
|
|
10483
|
+
|
|
10484
|
+
UNUSED(buffer);
|
|
10485
|
+
}
|
|
10486
|
+
|
|
10487
|
+
// unused at the moment
|
|
10488
|
+
//static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
|
|
10489
|
+
// return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
|
|
10490
|
+
//}
|
|
10491
|
+
|
|
10492
|
+
GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
10493
|
+
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
|
10494
|
+
delete ctx;
|
|
10495
|
+
}
|
|
10496
|
+
|
|
10497
|
+
GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
10498
|
+
// the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
|
|
10499
|
+
return (void *)0x1000;
|
|
10500
|
+
|
|
10501
|
+
UNUSED(buffer);
|
|
10502
|
+
}
|
|
10503
|
+
|
|
10504
|
+
GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
|
10505
|
+
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
|
10506
|
+
|
|
10507
|
+
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
|
10508
|
+
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
|
10509
|
+
|
|
10510
|
+
const int64_t ne0 = tensor->ne[0];
|
|
10511
|
+
|
|
10512
|
+
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
|
10513
|
+
|
|
10514
|
+
ctx->tensor_extras.push_back(extra);
|
|
10515
|
+
|
|
10516
|
+
for (int id = 0; id < g_device_count; ++id) {
|
|
10517
|
+
int64_t row_low, row_high;
|
|
10518
|
+
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
|
10519
|
+
|
|
10520
|
+
int64_t nrows_split = row_high - row_low;
|
|
10521
|
+
if (nrows_split == 0) {
|
|
10522
|
+
continue;
|
|
10523
|
+
}
|
|
10524
|
+
|
|
10525
|
+
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
|
10526
|
+
const size_t original_size = size;
|
|
10527
|
+
|
|
10528
|
+
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
|
10529
|
+
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
|
10530
|
+
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
10531
|
+
}
|
|
10532
|
+
|
|
10533
|
+
// FIXME: do not crash if cudaMalloc fails
|
|
10534
|
+
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
|
|
10535
|
+
ggml_cuda_set_device(id);
|
|
10536
|
+
char * buf;
|
|
10537
|
+
CUDA_CHECK(cudaMalloc(&buf, size));
|
|
10538
|
+
|
|
10539
|
+
// set padding to 0 to avoid possible NaN values
|
|
10540
|
+
if (size > original_size) {
|
|
10541
|
+
CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
|
|
10542
|
+
}
|
|
10543
|
+
|
|
10544
|
+
extra->data_device[id] = buf;
|
|
10545
|
+
|
|
10546
|
+
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
|
10547
|
+
CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
|
|
10548
|
+
}
|
|
10549
|
+
}
|
|
10550
|
+
tensor->backend = GGML_BACKEND_GPU_SPLIT;
|
|
10551
|
+
tensor->extra = extra;
|
|
10552
|
+
}
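init_tensor above rounds every device slice up so the last row covers a whole multiple of MATRIX_ROW_PADDING (512) elements, then zeroes the padding so kernels never read uninitialized or out-of-bounds memory. A standalone sketch of that size arithmetic, assuming a plain f32 tensor (for quantized types ggml_row_size() also accounts for block sizes, which is omitted here):

// Standalone sketch of the padding arithmetic above for an unquantized float tensor.
#include <cstdint>
#include <cstdio>

constexpr int64_t MATRIX_ROW_PADDING = 512; // same constant the CUDA backend uses

static int64_t padded_split_bytes(int64_t ne0, int64_t nrows_split, size_t elem_size) {
    int64_t size = ne0 * nrows_split * (int64_t) elem_size;   // ggml_nbytes_split() analogue
    if (ne0 % MATRIX_ROW_PADDING != 0) {
        // pad the last row up to the next multiple of 512 elements
        size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) * (int64_t) elem_size;
    }
    return size;
}

int main() {
    // 1000 columns of f32, 8 rows: 32000 bytes of data plus 24 padding elements (96 bytes)
    printf("%lld\n", (long long) padded_split_bytes(1000, 8, sizeof(float)));
    return 0;
}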
|
|
10553
|
+
|
|
10554
|
+
GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
10555
|
+
// split tensors must always be set in their entirety at once
|
|
10556
|
+
GGML_ASSERT(offset == 0);
|
|
10557
|
+
GGML_ASSERT(size == ggml_nbytes(tensor));
|
|
10558
|
+
|
|
10559
|
+
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
|
10560
|
+
|
|
10561
|
+
const int64_t ne0 = tensor->ne[0];
|
|
10562
|
+
const size_t nb1 = tensor->nb[1];
|
|
10563
|
+
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
|
|
10564
|
+
|
|
10565
|
+
for (int id = 0; id < g_device_count; ++id) {
|
|
10566
|
+
int64_t row_low, row_high;
|
|
10567
|
+
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
|
10568
|
+
|
|
10569
|
+
int64_t nrows_split = row_high - row_low;
|
|
10570
|
+
if (nrows_split == 0) {
|
|
10571
|
+
continue;
|
|
10572
|
+
}
|
|
10573
|
+
|
|
10574
|
+
const size_t offset_split = row_low*nb1;
|
|
10575
|
+
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
|
10576
|
+
const size_t original_size = size;
|
|
10577
|
+
|
|
10578
|
+
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
|
10579
|
+
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
|
10580
|
+
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
10581
|
+
}
|
|
10582
|
+
|
|
10583
|
+
const char * buf_host = (const char *)data + offset_split;
|
|
10584
|
+
CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
|
|
10585
|
+
}
|
|
10586
|
+
}
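set_tensor above hands each device the contiguous rows [row_low, row_high) of the host buffer, starting at byte offset row_low*nb1 and copying only the unpadded original_size. A standalone sketch of that slicing, with std::memcpy standing in for cudaMemcpyHostToDevice and a hard-coded two-way split (get_row_split and the padding are not reproduced here):

// Standalone sketch: slice a row-major host tensor into per-device row ranges.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    const int64_t nrows = 8, ne0 = 4;
    const size_t  nb1   = ne0 * sizeof(float);           // bytes per row
    std::vector<float> host(nrows * ne0, 1.0f);           // full tensor on the host

    // assume device 0 takes rows [0, 5) and device 1 takes rows [5, 8)
    const int64_t row_low[2]  = {0, 5};
    const int64_t row_high[2] = {5, 8};

    std::vector<std::vector<float>> device(2);
    for (int id = 0; id < 2; ++id) {
        const int64_t nrows_split = row_high[id] - row_low[id];
        if (nrows_split == 0) {
            continue;                                      // this device holds no rows
        }
        const size_t offset_split = row_low[id] * nb1;     // byte offset of this device's slice
        device[id].resize(nrows_split * ne0);
        // cudaMemcpyHostToDevice in the real code; plain memcpy here
        std::memcpy(device[id].data(), (const char *) host.data() + offset_split, nrows_split * nb1);
        printf("device %d gets %lld rows\n", id, (long long) nrows_split);
    }
    return 0;
}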
|
|
10587
|
+
|
|
10588
|
+
GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
10589
|
+
// split tensors must always be read in their entirety at once
|
|
10590
|
+
GGML_ASSERT(offset == 0);
|
|
10591
|
+
GGML_ASSERT(size == ggml_nbytes(tensor));
|
|
10592
|
+
|
|
10593
|
+
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
|
10594
|
+
|
|
10595
|
+
const int64_t ne0 = tensor->ne[0];
|
|
10596
|
+
const size_t nb1 = tensor->nb[1];
|
|
10597
|
+
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
|
|
10598
|
+
|
|
10599
|
+
for (int id = 0; id < g_device_count; ++id) {
|
|
10600
|
+
int64_t row_low, row_high;
|
|
10601
|
+
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
|
10602
|
+
|
|
10603
|
+
int64_t nrows_split = row_high - row_low;
|
|
10604
|
+
if (nrows_split == 0) {
|
|
10605
|
+
continue;
|
|
10606
|
+
}
|
|
10607
|
+
|
|
10608
|
+
const size_t offset_split = row_low*nb1;
|
|
10609
|
+
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
|
10610
|
+
const size_t original_size = size;
|
|
10611
|
+
|
|
10612
|
+
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
|
10613
|
+
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
|
10614
|
+
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
10615
|
+
}
|
|
10616
|
+
|
|
10617
|
+
char * buf_host = (char *)data + offset_split;
|
|
10618
|
+
CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
|
|
10619
|
+
}
|
|
10620
|
+
}
|
|
10621
|
+
|
|
10622
|
+
GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
10623
|
+
UNUSED(buffer);
|
|
10624
|
+
UNUSED(value);
|
|
10625
|
+
}
|
|
10626
|
+
|
|
10627
|
+
static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
|
|
10628
|
+
/* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
|
|
10629
|
+
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
|
|
10630
|
+
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
|
|
10631
|
+
/* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
|
|
10632
|
+
/* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
|
|
10633
|
+
/* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
|
|
10634
|
+
/* .cpy_tensor = */ NULL,
|
|
10635
|
+
/* .clear = */ ggml_backend_cuda_split_buffer_clear,
|
|
10636
|
+
/* .reset = */ NULL,
|
|
10637
|
+
};
|
|
10638
|
+
|
|
10639
|
+
// cuda split buffer type
|
|
10640
|
+
|
|
10641
|
+
GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
|
10642
|
+
return GGML_CUDA_NAME "_Split";
|
|
10643
|
+
|
|
10644
|
+
UNUSED(buft);
|
|
10645
|
+
}
|
|
10646
|
+
|
|
10647
|
+
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
10648
|
+
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
|
10649
|
+
// instead, we allocate them for each tensor separately in init_tensor
|
|
10650
|
+
// however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
|
|
10651
|
+
// as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
|
|
10652
|
+
ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
|
|
10653
|
+
|
|
10654
|
+
return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
|
|
10655
|
+
}
|
|
10656
|
+
|
|
10657
|
+
GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
10658
|
+
return 128;
|
|
10659
|
+
|
|
10660
|
+
UNUSED(buft);
|
|
10661
|
+
}
|
|
10662
|
+
|
|
10663
|
+
GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
|
10664
|
+
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
|
|
10665
|
+
|
|
10666
|
+
size_t total_size = 0;
|
|
10667
|
+
|
|
10668
|
+
const int64_t ne0 = tensor->ne[0];
|
|
10669
|
+
|
|
10670
|
+
for (int id = 0; id < g_device_count; ++id) {
|
|
10671
|
+
int64_t row_low, row_high;
|
|
10672
|
+
get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
|
|
10673
|
+
|
|
10674
|
+
int64_t nrows_split = row_high - row_low;
|
|
10675
|
+
if (nrows_split == 0) {
|
|
10676
|
+
continue;
|
|
10677
|
+
}
|
|
10678
|
+
|
|
10679
|
+
total_size += ggml_nbytes_split(tensor, nrows_split);
|
|
10680
|
+
|
|
10681
|
+
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
|
10682
|
+
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
|
10683
|
+
total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
10684
|
+
}
|
|
10685
|
+
}
|
|
10686
|
+
|
|
10687
|
+
return total_size;
|
|
10688
|
+
}
|
|
10689
|
+
|
|
10690
|
+
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
10691
|
+
return ggml_backend_is_cuda(backend);
|
|
10692
|
+
|
|
10693
|
+
UNUSED(buft);
|
|
10694
|
+
}
|
|
10695
|
+
|
|
10696
|
+
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
|
10697
|
+
return false;
|
|
10698
|
+
|
|
10699
|
+
UNUSED(buft);
|
|
10700
|
+
}
|
|
10701
|
+
|
|
10702
|
+
static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
|
|
10703
|
+
/* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
|
|
10704
|
+
/* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
|
|
10705
|
+
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
|
|
10706
|
+
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
|
|
10707
|
+
/* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
|
|
10708
|
+
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
|
10709
|
+
};
|
|
10710
|
+
|
|
10711
|
+
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
|
|
10712
|
+
// FIXME: this is not thread safe
|
|
10713
|
+
static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
|
|
10714
|
+
|
|
10715
|
+
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
|
|
10716
|
+
|
|
10717
|
+
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
|
|
10718
|
+
if (all_zero) {
|
|
10719
|
+
tensor_split_arr = g_default_tensor_split;
|
|
10720
|
+
} else {
|
|
10721
|
+
float split_sum = 0.0f;
|
|
10722
|
+
for (int i = 0; i < g_device_count; ++i) {
|
|
10723
|
+
tensor_split_arr[i] = split_sum;
|
|
10724
|
+
split_sum += tensor_split[i];
|
|
10725
|
+
}
|
|
10726
|
+
for (int i = 0; i < g_device_count; ++i) {
|
|
10727
|
+
tensor_split_arr[i] /= split_sum;
|
|
10728
|
+
}
|
|
10729
|
+
}
|
|
10730
|
+
|
|
10731
|
+
auto it = buft_map.find(tensor_split_arr);
|
|
10732
|
+
if (it != buft_map.end()) {
|
|
10733
|
+
return &it->second;
|
|
10734
|
+
}
|
|
10735
|
+
|
|
10736
|
+
struct ggml_backend_buffer_type buft {
|
|
10737
|
+
/* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
|
|
10738
|
+
/* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
|
|
10739
|
+
};
|
|
10740
|
+
|
|
10741
|
+
auto result = buft_map.emplace(tensor_split_arr, buft);
|
|
10742
|
+
return &result.first->second;
|
|
10743
|
+
}
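ggml_backend_cuda_split_buffer_type above normalizes the caller's per-device proportions into cumulative fractions (a prefix sum divided by the total) and caches one buffer type per distinct split in a static std::map keyed by a std::array. A standalone sketch of just the normalization step; GGML_CUDA_MAX_DEVICES is assumed to be 16 here purely for illustration:

// Standalone sketch of the tensor_split normalization above: raw shares become
// cumulative boundaries in [0, 1), one per device.
#include <array>
#include <cstdio>

constexpr int MAX_DEVICES = 16; // stand-in for GGML_CUDA_MAX_DEVICES

static std::array<float, MAX_DEVICES> normalize_split(const float * tensor_split, int device_count) {
    std::array<float, MAX_DEVICES> boundaries = {};
    float split_sum = 0.0f;
    for (int i = 0; i < device_count; ++i) {
        boundaries[i] = split_sum;   // boundary of device i = sum of all earlier shares
        split_sum += tensor_split[i];
    }
    for (int i = 0; i < device_count; ++i) {
        boundaries[i] /= split_sum;  // scale so the shares sum to 1
    }
    return boundaries;
}

int main() {
    const float split[2] = {3.0f, 1.0f};       // a 75% / 25% split across two devices
    const auto b = normalize_split(split, 2);
    printf("%.2f %.2f\n", b[0], b[1]);          // prints 0.00 0.75
    return 0;
}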
|
|
10744
|
+
|
|
9802
10745
|
// host buffer type
|
|
9803
10746
|
|
|
9804
|
-
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
10747
|
+
GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
|
10748
|
+
return GGML_CUDA_NAME "_Host";
|
|
10749
|
+
|
|
10750
|
+
UNUSED(buft);
|
|
10751
|
+
}
|
|
10752
|
+
|
|
10753
|
+
GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
|
|
10754
|
+
return GGML_CUDA_NAME "_Host";
|
|
10755
|
+
|
|
10756
|
+
UNUSED(buffer);
|
|
10757
|
+
}
|
|
10758
|
+
|
|
10759
|
+
GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
9805
10760
|
ggml_cuda_host_free(buffer->context);
|
|
9806
10761
|
}
|
|
9807
10762
|
|
|
9808
|
-
static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
10763
|
+
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
9809
10764
|
void * ptr = ggml_cuda_host_malloc(size);
|
|
9810
10765
|
|
|
9811
10766
|
if (ptr == nullptr) {
|
|
@@ -9813,17 +10768,18 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
|
|
|
9813
10768
|
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
|
9814
10769
|
}
|
|
9815
10770
|
|
|
9816
|
-
// FIXME: this is a hack to avoid having to implement a new buffer type
|
|
9817
10771
|
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
|
9818
10772
|
buffer->buft = buft;
|
|
10773
|
+
buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
|
|
9819
10774
|
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
|
|
9820
10775
|
|
|
9821
10776
|
return buffer;
|
|
9822
10777
|
}
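The host buffer allocator above asks for pinned (page-locked) memory first and falls back to an ordinary CPU buffer when that fails. A minimal standalone sketch of the same try-pinned-then-fall-back pattern using the raw CUDA runtime calls (this is not ggml_cuda_host_malloc, which lives elsewhere in the backend):

// Standalone sketch: prefer cudaMallocHost, fall back to the regular heap, and
// remember which allocator succeeded so the memory is released the matching way.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

struct host_alloc {
    void * ptr    = nullptr;
    bool   pinned = false;
};

static host_alloc alloc_host(size_t size) {
    host_alloc a;
    if (cudaMallocHost(&a.ptr, size) == cudaSuccess) {
        a.pinned = true;          // page-locked memory: faster H2D/D2H transfers
    } else {
        a.ptr = malloc(size);     // fallback, analogous to the CPU-buffer fallback above
    }
    return a;
}

static void free_host(host_alloc & a) {
    if (a.ptr == nullptr) {
        return;
    }
    if (a.pinned) {
        cudaFreeHost(a.ptr);
    } else {
        free(a.ptr);
    }
    a.ptr = nullptr;
}

int main() {
    host_alloc a = alloc_host(1 << 20);
    printf("pinned: %d\n", a.pinned ? 1 : 0);
    free_host(a);
    return 0;
}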
|
|
9823
10778
|
|
|
9824
|
-
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
|
10779
|
+
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
|
9825
10780
|
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
|
|
9826
10781
|
/* .iface = */ {
|
|
10782
|
+
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
|
|
9827
10783
|
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
|
|
9828
10784
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
|
9829
10785
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
|
@@ -9838,31 +10794,27 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
|
|
9838
10794
|
|
|
9839
10795
|
// backend
|
|
9840
10796
|
|
|
9841
|
-
|
|
9842
|
-
|
|
9843
|
-
};
|
|
9844
|
-
|
|
9845
|
-
static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
|
9846
|
-
return GGML_CUDA_NAME;
|
|
10797
|
+
GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
|
10798
|
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
9847
10799
|
|
|
9848
|
-
|
|
10800
|
+
return cuda_ctx->name.c_str();
|
|
9849
10801
|
}
|
|
9850
10802
|
|
|
9851
|
-
static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
|
9852
|
-
|
|
10803
|
+
GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
|
10804
|
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
9853
10805
|
|
|
9854
10806
|
delete cuda_ctx;
|
|
9855
10807
|
delete backend;
|
|
9856
10808
|
}
|
|
9857
10809
|
|
|
9858
|
-
static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
|
|
9859
|
-
|
|
10810
|
+
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
|
|
10811
|
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
9860
10812
|
|
|
9861
10813
|
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
|
|
9862
10814
|
}
|
|
9863
10815
|
|
|
9864
|
-
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
9865
|
-
|
|
10816
|
+
GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
10817
|
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
9866
10818
|
|
|
9867
10819
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
|
9868
10820
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
|
@@ -9870,8 +10822,8 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
|
|
|
9870
10822
|
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
|
|
9871
10823
|
}
|
|
9872
10824
|
|
|
9873
|
-
static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
9874
|
-
|
|
10825
|
+
GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
10826
|
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
9875
10827
|
|
|
9876
10828
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
|
9877
10829
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
|
@@ -9879,39 +10831,27 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
|
|
|
9879
10831
|
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
|
|
9880
10832
|
}
|
|
9881
10833
|
|
|
9882
|
-
static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
|
9883
|
-
|
|
9884
|
-
|
|
9885
|
-
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
|
|
9886
|
-
|
|
9887
|
-
UNUSED(backend);
|
|
9888
|
-
}
|
|
9889
|
-
|
|
9890
|
-
static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
|
9891
|
-
GGML_ASSERT(!"not implemented");
|
|
10834
|
+
GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
|
|
10835
|
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
9892
10836
|
|
|
9893
|
-
|
|
10837
|
+
if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
|
|
10838
|
+
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
|
|
10839
|
+
return true;
|
|
10840
|
+
}
|
|
9894
10841
|
|
|
9895
|
-
|
|
9896
|
-
UNUSED(cgraph);
|
|
10842
|
+
return false;
|
|
9897
10843
|
}
|
|
9898
10844
|
|
|
9899
|
-
static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
|
9900
|
-
|
|
10845
|
+
GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
|
10846
|
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
9901
10847
|
|
|
9902
|
-
|
|
9903
|
-
UNUSED(plan);
|
|
9904
|
-
}
|
|
9905
|
-
|
|
9906
|
-
static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
|
9907
|
-
GGML_ASSERT(!"not implemented");
|
|
10848
|
+
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
|
|
9908
10849
|
|
|
9909
10850
|
UNUSED(backend);
|
|
9910
|
-
UNUSED(plan);
|
|
9911
10851
|
}
|
|
9912
10852
|
|
|
9913
|
-
static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
|
9914
|
-
|
|
10853
|
+
GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
|
10854
|
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
9915
10855
|
|
|
9916
10856
|
ggml_cuda_set_main_device(cuda_ctx->device);
|
|
9917
10857
|
|
|
@@ -9921,55 +10861,35 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
|
|
|
9921
10861
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
9922
10862
|
ggml_tensor * node = cgraph->nodes[i];
|
|
9923
10863
|
|
|
9924
|
-
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
|
|
10864
|
+
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
|
9925
10865
|
continue;
|
|
10866
|
+
}
|
|
9926
10867
|
|
|
9927
|
-
assert(node->backend == GGML_BACKEND_GPU);
|
|
10868
|
+
#ifndef NDEBUG
|
|
10869
|
+
assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT);
|
|
9928
10870
|
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
|
9929
10871
|
assert(node->extra != nullptr);
|
|
9930
10872
|
|
|
9931
10873
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
9932
10874
|
if (node->src[j] != nullptr) {
|
|
9933
|
-
assert(node->src[j]->backend == GGML_BACKEND_GPU);
|
|
10875
|
+
assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
|
|
9934
10876
|
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
|
9935
10877
|
assert(node->src[j]->extra != nullptr);
|
|
9936
10878
|
}
|
|
9937
10879
|
}
|
|
10880
|
+
#endif
|
|
9938
10881
|
|
|
9939
10882
|
bool ok = ggml_cuda_compute_forward(¶ms, node);
|
|
9940
10883
|
if (!ok) {
|
|
9941
10884
|
fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
|
9942
10885
|
}
|
|
9943
10886
|
GGML_ASSERT(ok);
|
|
9944
|
-
|
|
9945
|
-
#if 0
|
|
9946
|
-
if (node->type == GGML_TYPE_F32) {
|
|
9947
|
-
cudaDeviceSynchronize();
|
|
9948
|
-
std::vector<float> tmp(ggml_nelements(node), 0.0f);
|
|
9949
|
-
cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
|
|
9950
|
-
printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
|
|
9951
|
-
ggml_type_name(node->src[0]->type),
|
|
9952
|
-
node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
|
|
9953
|
-
node->src[0]->name,
|
|
9954
|
-
node->src[1] ? node->src[1]->name : "none");
|
|
9955
|
-
double sum = 0.0;
|
|
9956
|
-
double sq_sum = 0.0;
|
|
9957
|
-
for (int i = 0; i < ggml_nelements(node); i++) {
|
|
9958
|
-
printf("%f ", tmp[i]);
|
|
9959
|
-
sum += tmp[i];
|
|
9960
|
-
sq_sum += tmp[i]*tmp[i];
|
|
9961
|
-
}
|
|
9962
|
-
printf("\n");
|
|
9963
|
-
printf("sum: %f, ", sum);
|
|
9964
|
-
printf("sq_sum: %f\n", sq_sum);
|
|
9965
|
-
}
|
|
9966
|
-
#endif
|
|
9967
10887
|
}
|
|
9968
10888
|
|
|
9969
|
-
|
|
10889
|
+
return true;
|
|
9970
10890
|
}
|
|
9971
10891
|
|
|
9972
|
-
static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
|
10892
|
+
GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
|
9973
10893
|
switch (op->op) {
|
|
9974
10894
|
case GGML_OP_UNARY:
|
|
9975
10895
|
switch (ggml_get_unary_op(op)) {
|
|
@@ -10080,23 +11000,22 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
|
|
|
10080
11000
|
UNUSED(backend);
|
|
10081
11001
|
}
|
|
10082
11002
|
|
|
10083
|
-
static ggml_backend_i cuda_backend_i = {
|
|
11003
|
+
static ggml_backend_i ggml_backend_cuda_interface = {
|
|
10084
11004
|
/* .get_name = */ ggml_backend_cuda_name,
|
|
10085
11005
|
/* .free = */ ggml_backend_cuda_free,
|
|
10086
11006
|
/* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
|
|
10087
11007
|
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
|
|
10088
11008
|
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
|
|
10089
|
-
/* .cpy_tensor_from_async = */ NULL,
|
|
10090
|
-
/* .cpy_tensor_to_async = */ NULL,
|
|
11009
|
+
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
|
|
10091
11010
|
/* .synchronize = */ ggml_backend_cuda_synchronize,
|
|
10092
|
-
/* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
|
|
10093
|
-
/* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
|
|
10094
|
-
/* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
|
|
11011
|
+
/* .graph_plan_create = */ NULL,
|
|
11012
|
+
/* .graph_plan_free = */ NULL,
|
|
11013
|
+
/* .graph_plan_compute = */ NULL,
|
|
10095
11014
|
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
|
10096
11015
|
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
|
10097
11016
|
};
|
|
10098
11017
|
|
|
10099
|
-
ggml_backend_t ggml_backend_cuda_init(int device) {
|
|
11018
|
+
GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
|
|
10100
11019
|
ggml_init_cublas(); // TODO: remove from ggml.c
|
|
10101
11020
|
|
|
10102
11021
|
if (device < 0 || device >= ggml_cuda_get_device_count()) {
|
|
@@ -10107,32 +11026,48 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
|
|
|
10107
11026
|
// not strictly necessary, but it may reduce the overhead of the first graph_compute
|
|
10108
11027
|
ggml_cuda_set_main_device(device);
|
|
10109
11028
|
|
|
10110
|
-
|
|
10111
|
-
/* .device = */ device
|
|
11029
|
+
ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
|
|
11030
|
+
/* .device = */ device,
|
|
11031
|
+
/* .name = */ GGML_CUDA_NAME + std::to_string(device),
|
|
10112
11032
|
};
|
|
10113
11033
|
|
|
10114
11034
|
ggml_backend_t cuda_backend = new ggml_backend {
|
|
10115
|
-
/* .interface = */ cuda_backend_i,
|
|
11035
|
+
/* .interface = */ ggml_backend_cuda_interface,
|
|
10116
11036
|
/* .context = */ ctx
|
|
10117
11037
|
};
|
|
10118
11038
|
|
|
10119
11039
|
return cuda_backend;
|
|
10120
11040
|
}
|
|
10121
11041
|
|
|
10122
|
-
bool ggml_backend_is_cuda(ggml_backend_t backend) {
|
|
10123
|
-
return backend->iface.get_name == ggml_backend_cuda_name;
|
|
11042
|
+
GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
|
|
11043
|
+
return backend && backend->iface.get_name == ggml_backend_cuda_name;
|
|
11044
|
+
}
|
|
11045
|
+
|
|
11046
|
+
GGML_CALL int ggml_backend_cuda_get_device_count() {
|
|
11047
|
+
return ggml_cuda_get_device_count();
|
|
11048
|
+
}
|
|
11049
|
+
|
|
11050
|
+
GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
|
|
11051
|
+
ggml_cuda_get_device_description(device, description, description_size);
|
|
11052
|
+
}
|
|
11053
|
+
|
|
11054
|
+
GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
|
|
11055
|
+
ggml_cuda_set_device(device);
|
|
11056
|
+
|
|
11057
|
+
CUDA_CHECK(cudaMemGetInfo(free, total));
|
|
10124
11058
|
}
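ggml_backend_cuda_get_device_memory above is a thin wrapper that selects the device and forwards to cudaMemGetInfo. A standalone sketch of the underlying query for device 0:

// Standalone sketch of the cudaMemGetInfo query wrapped by the new API above.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    size_t free_bytes = 0, total_bytes = 0;
    if (cudaSetDevice(0) != cudaSuccess || cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) {
        fprintf(stderr, "failed to query device 0\n");
        return 1;
    }
    printf("free: %zu MiB, total: %zu MiB\n", free_bytes / (1024 * 1024), total_bytes / (1024 * 1024));
    return 0;
}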
|
|
10125
11059
|
|
|
10126
|
-
static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
|
|
11060
|
+
// backend registry
|
|
11061
|
+
GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
|
|
10127
11062
|
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
|
|
10128
11063
|
return cuda_backend;
|
|
10129
11064
|
|
|
10130
11065
|
UNUSED(params);
|
|
10131
11066
|
}
|
|
10132
11067
|
|
|
10133
|
-
extern "C" int ggml_backend_cuda_reg_devices();
|
|
11068
|
+
extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
|
|
10134
11069
|
|
|
10135
|
-
int ggml_backend_cuda_reg_devices() {
|
|
11070
|
+
GGML_CALL int ggml_backend_cuda_reg_devices() {
|
|
10136
11071
|
int device_count = ggml_cuda_get_device_count();
|
|
10137
11072
|
//int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
|
|
10138
11073
|
for (int i = 0; i < device_count; i++) {
|