llama_cpp 0.8.0 → 0.9.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +81 -162
- data/ext/llama_cpp/src/ggml-cuda.cu +188 -20
- data/ext/llama_cpp/src/ggml-metal.m +13 -5
- data/ext/llama_cpp/src/ggml-metal.metal +9 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +161 -169
- data/ext/llama_cpp/src/ggml.c +362 -84
- data/ext/llama_cpp/src/ggml.h +8 -7
- data/ext/llama_cpp/src/llama.cpp +100 -95
- data/ext/llama_cpp/src/llama.h +16 -21
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +11 -12
- metadata +2 -2
data/ext/llama_cpp/src/ggml-cuda.cu
@@ -29,6 +29,8 @@
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasCreate hipblasCreate
 #define cublasGemmEx hipblasGemmEx
+#define cublasGemmBatchedEx hipblasGemmBatchedEx
+#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
 #define cublasHandle_t hipblasHandle_t
 #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
 #define cublasSetStream hipblasSetStream
@@ -4326,13 +4328,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
 
     const half * x = (const half *) vx;
 
-    const int row_x
-    const int channel
+    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
+    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
     const int channel_x = channel / channel_x_divisor;
 
-    const int nrows_y
+    const int nrows_y = ncols_x;
     const int nrows_dst = nrows_x;
-    const int row_dst
+    const int row_dst = row_x;
 
     const int idst = channel*nrows_dst + row_dst;
 
@@ -4345,13 +4347,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
             break;
         }
 
-        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
-        const float xi = __half2float(x[ix]);
-
         const int row_y = col_x;
 
+        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
         const int iy = channel*nrows_y + row_y;
 
+        const float xi = __half2float(x[ix]);
+
         tmp += xi * y[iy];
     }
 
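
The two hunks above only reorder the index computations inside mul_mat_vec_nc_f16_f32; the access pattern itself is unchanged. As a reading aid, here is a minimal CPU-side sketch of the same indexing in plain C++. It is not code from the gem: the parameter names mirror the kernel's locals, the f16 input is modeled as float, and the warp reduction plus final store (which lie outside the hunk) are collapsed into an ordinary accumulation loop.

    #include <vector>

    // CPU reference for the indexing in mul_mat_vec_nc_f16_f32:
    // x is a non-contiguous [nrows_x x ncols_x] matrix per channel (strides in elements),
    // y is [ncols_x] per channel, dst is [nrows_x] per channel.
    static void mul_mat_vec_nc_reference(
            const std::vector<float> & x, const std::vector<float> & y, std::vector<float> & dst,
            int ncols_x, int nrows_x, int row_stride_x,
            int nchannels_x, int nchannels_y, int channel_stride_x) {
        const int channel_x_divisor = nchannels_y / nchannels_x; // broadcast of x channels over y channels
        const int nrows_y   = ncols_x;
        const int nrows_dst = nrows_x;

        for (int channel = 0; channel < nchannels_y; ++channel) {
            const int channel_x = channel / channel_x_divisor;
            for (int row_x = 0; row_x < nrows_x; ++row_x) {
                const int row_dst = row_x;
                const int idst    = channel*nrows_dst + row_dst;

                float tmp = 0.0f;
                for (int col_x = 0; col_x < ncols_x; ++col_x) {
                    const int row_y = col_x;
                    const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
                    const int iy = channel*nrows_y + row_y;
                    tmp += x[ix] * y[iy]; // the kernel accumulates this per thread, then warp-reduces
                }
                dst[idst] = tmp;
            }
        }
    }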
@@ -5662,10 +5664,10 @@ void ggml_init_cublas() {
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
         fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
-        for (
+        for (int id = 0; id < g_device_count; ++id) {
             cudaDeviceProp prop;
             CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-            fprintf(stderr, " Device %
+            fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
 
             g_tensor_split[id] = total_vram;
             total_vram += prop.totalGlobalMem;
@@ -5675,15 +5677,15 @@ void ggml_init_cublas() {
             g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
         }
-        for (
+        for (int id = 0; id < g_device_count; ++id) {
             g_tensor_split[id] /= total_vram;
         }
 
-        for (
+        for (int id = 0; id < g_device_count; ++id) {
             CUDA_CHECK(ggml_cuda_set_device(id));
 
             // create cuda streams
-            for (
+            for (int is = 0; is < MAX_STREAMS; ++is) {
                 CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
             }
 
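
The two loops above first record a running VRAM total per device and then normalize it, so each g_tensor_split[id] ends up as the fraction of total VRAM owned by the devices before device id; split tensors are then partitioned at those boundaries. A small self-contained sketch of that two-pass normalization (plain C++; the VRAM figures are made-up example values, not anything from the diff):

    #include <cstdio>
    #include <vector>

    int main() {
        // hypothetical per-device VRAM in bytes (e.g. a 24 GiB and an 8 GiB card)
        const std::vector<long long> vram = {24LL << 30, 8LL << 30};

        std::vector<double> tensor_split(vram.size());
        long long total_vram = 0;

        // same two-pass scheme as ggml_init_cublas: prefix sum, then normalize
        for (size_t id = 0; id < vram.size(); ++id) {
            tensor_split[id] = (double) total_vram; // VRAM of all devices before this one
            total_vram      += vram[id];
        }
        for (size_t id = 0; id < vram.size(); ++id) {
            tensor_split[id] /= (double) total_vram;
        }

        // device 0 owns the row range [0.00, 0.75), device 1 owns [0.75, 1.00)
        for (size_t id = 0; id < vram.size(); ++id) {
            printf("device %zu: split boundary %.2f\n", id, tensor_split[id]);
        }
        return 0;
    }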
@@ -6252,16 +6254,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, const cudaStream_t & stream) {
 
-    GGML_ASSERT(src0_dd_i
+    GGML_ASSERT(src0_dd_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_dd_i
-
+    GGML_ASSERT(dst_dd_i != nullptr);
 
     const int64_t ne00 = src0->ne[0];
-
     const int64_t ne10 = src1->ne[0];
 
     const int64_t ne0 = dst->ne[0];
+
     const int64_t row_diff = row_high - row_low;
 
     int id;
@@ -7013,7 +7014,8 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
 }
 
 static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
-    GGML_ASSERT(!
+    GGML_ASSERT(!ggml_is_transposed(src0));
+    GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -7023,11 +7025,11 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
 
-    const int64_t ne12 = src1->ne[2];
-
     const int64_t nb01 = src0->nb[1];
     const int64_t nb02 = src0->nb[2];
 
+    const int64_t ne12 = src1->ne[2];
+
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
@@ -7046,6 +7048,159 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
+static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+    GGML_ASSERT(!ggml_is_transposed(src0));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
+    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
+    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+
+    const int64_t ne1 = ggml_nelements(src1);
+    const int64_t ne = ggml_nelements(dst);
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
+
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    void * src0_ddq = src0_extra->data_device[g_main_device];
+    half * src0_as_f16 = (half *) src0_ddq;
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    // convert src1 to fp16
+    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+    GGML_ASSERT(to_fp16_cuda != nullptr);
+
+    size_t src1_as = 0;
+    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
+    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
+
+    size_t dst_as = 0;
+    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    const half alpha_f16 = 1.0f;
+    const half beta_f16 = 0.0f;
+
+#if 0
+    // use cublasGemmEx
+    {
+        for (int i13 = 0; i13 < ne13; ++i13) {
+            for (int i12 = 0; i12 < ne12; ++i12) {
+                int i03 = i13 / r3;
+                int i02 = i12 / r2;
+
+                CUBLAS_CHECK(
+                    cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                        ne01, ne11, ne10,
+                        &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
+                                    (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
+                        &beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
+                        CUBLAS_COMPUTE_16F,
+                        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+            }
+        }
+    }
+#else
+    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
+        // use cublasGemmStridedBatchedEx
+        CUBLAS_CHECK(
+            cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                ne01, ne11, ne10,
+                &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
+                            (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
+                &beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC
+                ne12*ne13,
+                CUBLAS_COMPUTE_16F,
+                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+    } else {
+        // use cublasGemmBatchedEx
+        // TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
+        const int ne23 = ne12*ne13;
+
+        // TODO: avoid this alloc
+        void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));
+
+        for (int i13 = 0; i13 < ne13; ++i13) {
+            for (int i12 = 0; i12 < ne12; ++i12) {
+                int i03 = i13 / r3;
+                int i02 = i12 / r2;
+
+                ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3];
+                ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
+                ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
+            }
+        }
+
+        // allocate device memory for pointers
+        void ** ptrs_as = nullptr;
+        CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));
+
+        // TODO: this does not work for some reason -- not sure why?
+        //size_t ptrs_s = 0;
+        //ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+
+        // copy pointers to device
+        CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice));
+
+        free(ptrs);
+
+        CUBLAS_CHECK(
+            cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                ne01, ne11, ne10,
+                &alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                            (const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+                &beta_f16, ( void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
+                ne23,
+                CUBLAS_COMPUTE_16F,
+                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+        // free device memory for pointers
+        CUDA_CHECK(cudaFree(ptrs_as));
+        //ggml_cuda_pool_free(ptrs_as, ptrs_s);
+    }
+#endif
+
+    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+
+    ggml_cuda_pool_free(src1_as_f16, src1_as);
+    ggml_cuda_pool_free(dst_f16, dst_as);
+}
+
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
         src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
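
In the new ggml_cuda_mul_mat_mat_batched_cublas, the non-strided branch builds a host-side table with one src0/src1/dst pointer per (i12, i13) batch entry; the broadcast factors r2 = ne12/ne02 and r3 = ne13/ne03 map each src1 batch index back to the src0 slice it reuses, and the f32 byte strides of src1/dst are halved because both buffers were converted to f16. A host-only sketch of just that table construction (plain C++; the struct, parameter names, and stand-in char* buffers are illustrative, not the gem's API):

    #include <cstdint>
    #include <vector>

    struct GemmBatchPointers {
        std::vector<const void *> a; // src0 slices (f16)
        std::vector<const void *> b; // src1 slices (converted to f16)
        std::vector<void *>       c; // dst slices (f16)
    };

    // Mirrors the ptrs[] layout in the diff: index i12 + i13*ne12 selects the batch
    // entry, and i02 = i12/r2, i03 = i13/r3 pick the (possibly broadcast) src0 slice.
    static GemmBatchPointers build_batch_pointers(
            const char * src0_f16, const char * src1_f16, char * dst_f16,
            int64_t ne02, int64_t ne03, int64_t ne12, int64_t ne13,
            size_t nb02, size_t nb03,      // src0 byte strides for dims 2 and 3
            size_t s1_nb2, size_t s1_nb3,  // src1 byte strides (as f32) for dims 2 and 3
            size_t d_nb2, size_t d_nb3) {  // dst  byte strides (as f32) for dims 2 and 3
        const int64_t r2   = ne12/ne02;    // broadcast factor along dim 2
        const int64_t r3   = ne13/ne03;    // broadcast factor along dim 3
        const int64_t ne23 = ne12*ne13;    // total batch count

        GemmBatchPointers p;
        p.a.resize(ne23); p.b.resize(ne23); p.c.resize(ne23);

        for (int64_t i13 = 0; i13 < ne13; ++i13) {
            for (int64_t i12 = 0; i12 < ne12; ++i12) {
                const int64_t i03 = i13 / r3;
                const int64_t i02 = i12 / r2;
                const int64_t k   = i12 + i13*ne12;
                p.a[k] = src0_f16 + i02*nb02     + i03*nb03;
                p.b[k] = src1_f16 + i12*s1_nb2/2 + i13*s1_nb3/2; // f32 strides halved: data is now f16
                p.c[k] = dst_f16  + i12*d_nb2/2  + i13*d_nb3/2;
            }
        }
        return p;
    }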
@@ -7058,10 +7213,23 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }
 
+    // debug helpers
+    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
+    //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
+    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
+    //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
+    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
+    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
+
     if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+        // KQ single-batch
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && !ggml_is_contiguous(src0) &&
+    } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+        // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
+    } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        // KQ + KQV multi-batch
+        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
data/ext/llama_cpp/src/ggml-metal.m
@@ -62,6 +62,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(mul);
     GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
     GGML_METAL_DECL_KERNEL(scale);
+    GGML_METAL_DECL_KERNEL(scale_4);
     GGML_METAL_DECL_KERNEL(silu);
     GGML_METAL_DECL_KERNEL(relu);
     GGML_METAL_DECL_KERNEL(gelu);
@@ -249,6 +250,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(mul);
         GGML_METAL_ADD_KERNEL(mul_row);
         GGML_METAL_ADD_KERNEL(scale);
+        GGML_METAL_ADD_KERNEL(scale_4);
         GGML_METAL_ADD_KERNEL(silu);
         GGML_METAL_ADD_KERNEL(relu);
         GGML_METAL_ADD_KERNEL(gelu);
@@ -347,6 +349,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(mul);
     GGML_METAL_DEL_KERNEL(mul_row);
     GGML_METAL_DEL_KERNEL(scale);
+    GGML_METAL_DEL_KERNEL(scale_4);
     GGML_METAL_DEL_KERNEL(silu);
     GGML_METAL_DEL_KERNEL(relu);
     GGML_METAL_DEL_KERNEL(gelu);
@@ -923,15 +926,20 @@ void ggml_metal_graph_compute(
 
                         const float scale = *(const float *) src1->data;
 
-
+                        int64_t n = ggml_nelements(dst);
+
+                        if (n % 4 == 0) {
+                            n /= 4;
+                            [encoder setComputePipelineState:ctx->pipeline_scale_4];
+                        } else {
+                            [encoder setComputePipelineState:ctx->pipeline_scale];
+                        }
+
                         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                         [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                         [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
 
-
-                        GGML_ASSERT(n % 4 == 0);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                     } break;
                 case GGML_OP_UNARY:
                     switch (ggml_get_unary_op(gf->nodes[i])) {
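
The Metal change keeps two GGML_OP_SCALE pipelines and picks one at encode time: when the element count is divisible by 4 it dispatches the float4 kernel over n/4 threads, otherwise the scalar kernel over n threads, so the old GGML_ASSERT(n % 4 == 0) is no longer needed. The same selection logic in a CPU-only sketch (plain C++; the float4 step is modeled as a 4-wide inner loop):

    #include <cstdint>
    #include <vector>

    // Scale a buffer, preferring a 4-wide path when the length allows it,
    // mirroring the pipeline_scale_4 / pipeline_scale selection in ggml-metal.m.
    static void scale_buffer(std::vector<float> & data, float scale) {
        int64_t n = (int64_t) data.size();

        if (n % 4 == 0) {
            // "scale_4" path: each iteration handles one float4-worth of elements
            n /= 4;
            for (int64_t i = 0; i < n; ++i) {
                for (int j = 0; j < 4; ++j) {
                    data[4*i + j] *= scale;
                }
            }
        } else {
            // scalar path: one element per iteration
            for (int64_t i = 0; i < n; ++i) {
                data[i] *= scale;
            }
        }
    }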
data/ext/llama_cpp/src/ggml-metal.metal
@@ -125,9 +125,17 @@ kernel void kernel_mul_row(
 }
 
 kernel void kernel_scale(
+        device const float * src0,
+        device float * dst,
+        constant float & scale,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] * scale;
+}
+
+kernel void kernel_scale_4(
         device const float4 * src0,
         device float4 * dst,
-        constant float
+        constant float & scale,
         uint tpig[[thread_position_in_grid]]) {
     dst[tpig] = src0[tpig] * scale;
 }