llama_cpp 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +81 -162
- data/ext/llama_cpp/src/ggml-cuda.cu +188 -20
- data/ext/llama_cpp/src/ggml-metal.m +13 -5
- data/ext/llama_cpp/src/ggml-metal.metal +9 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +161 -169
- data/ext/llama_cpp/src/ggml.c +362 -84
- data/ext/llama_cpp/src/ggml.h +8 -7
- data/ext/llama_cpp/src/llama.cpp +100 -95
- data/ext/llama_cpp/src/llama.h +16 -21
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +11 -12
- metadata +2 -2
data/ext/llama_cpp/src/ggml-cuda.cu

```diff
@@ -29,6 +29,8 @@
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasCreate hipblasCreate
 #define cublasGemmEx hipblasGemmEx
+#define cublasGemmBatchedEx hipblasGemmBatchedEx
+#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
 #define cublasHandle_t hipblasHandle_t
 #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
 #define cublasSetStream hipblasSetStream
```
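The two added defines extend the existing cuBLAS→hipBLAS aliasing table so that the batched GEMM calls introduced later in this diff compile unchanged when ggml is built for AMD GPUs. A minimal, self-contained sketch of the same compile-time aliasing pattern (the guard and function names below are stand-ins, not the real cuBLAS/hipBLAS symbols):

```cpp
#include <cstdio>

// Stand-in for the GGML_USE_HIPBLAS / __HIP_PLATFORM_AMD__ guard used in ggml-cuda.cu.
// Call sites are written against one spelling; the preprocessor redirects them per backend.
#if defined(USE_ALT_BACKEND)
#define gemmBatchedEx        alt_gemm_batched          // plays the role of hipblasGemmBatchedEx
#define gemmStridedBatchedEx alt_gemm_strided_batched  // plays the role of hipblasGemmStridedBatchedEx
#else
#define gemmBatchedEx        ref_gemm_batched
#define gemmStridedBatchedEx ref_gemm_strided_batched
#endif

void ref_gemm_batched()         { std::puts("reference batched GEMM"); }
void ref_gemm_strided_batched() { std::puts("reference strided batched GEMM"); }
void alt_gemm_batched()         { std::puts("alternate batched GEMM"); }
void alt_gemm_strided_batched() { std::puts("alternate strided batched GEMM"); }

int main() {
    gemmBatchedEx();          // resolves at compile time to one backend or the other
    gemmStridedBatchedEx();
}
```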
```diff
@@ -4326,13 +4328,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
 
     const half * x = (const half *) vx;
 
-    const int row_x
-    const int channel
+    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
+    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
     const int channel_x = channel / channel_x_divisor;
 
-    const int nrows_y
+    const int nrows_y = ncols_x;
     const int nrows_dst = nrows_x;
-    const int row_dst
+    const int row_dst = row_x;
 
     const int idst = channel*nrows_dst + row_dst;
 
```
```diff
@@ -4345,13 +4347,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
             break;
         }
 
-        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
-        const float xi = __half2float(x[ix]);
-
         const int row_y = col_x;
 
+        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
         const int iy = channel*nrows_y + row_y;
 
+        const float xi = __half2float(x[ix]);
+
         tmp += xi * y[iy];
     }
 
```
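The two hunks above only reorder where the kernel's locals are declared; the addressing itself is unchanged. For readers following the index math, here is a host-side sketch that reproduces the same `ix`/`iy`/`idst` arithmetic with made-up sizes and strides (the real kernel reads these from ggml's `ne`/`nb` fields and does a warp reduction instead of a plain loop):

```cpp
#include <cstdio>
#include <vector>

// Host-side sketch of the index arithmetic in mul_mat_vec_nc_f16_f32 (non-contiguous src0).
// Sizes and strides below are made up for illustration.
int main() {
    const int ncols_x           = 4;  // ne00
    const int nrows_x           = 3;  // ne01
    const int nchannels_x       = 2;  // ne02
    const int nchannels_y       = 4;  // ne12 (broadcast: channel_x = channel / (ne12/ne02))
    const int row_stride_x      = ncols_x;            // elements between consecutive rows of x
    const int channel_stride_x  = ncols_x * nrows_x;  // elements between consecutive channels of x
    const int channel_x_divisor = nchannels_y / nchannels_x;

    const int nrows_y   = ncols_x;   // y is (ncols_x x 1) per channel
    const int nrows_dst = nrows_x;

    std::vector<float> x(nchannels_x * channel_stride_x, 1.0f);
    std::vector<float> y(nchannels_y * nrows_y, 2.0f);
    std::vector<float> dst(nchannels_y * nrows_dst, 0.0f);

    for (int channel = 0; channel < nchannels_y; ++channel) {
        const int channel_x = channel / channel_x_divisor;
        for (int row_x = 0; row_x < nrows_x; ++row_x) {
            float tmp = 0.0f;
            for (int col_x = 0; col_x < ncols_x; ++col_x) {
                const int row_y = col_x;
                const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
                const int iy = channel*nrows_y + row_y;
                tmp += x[ix] * y[iy];        // the kernel does __half2float(x[ix]) * y[iy]
            }
            const int idst = channel*nrows_dst + row_x;   // row_dst == row_x
            dst[idst] = tmp;                 // in the kernel this is a warp-reduced sum
        }
    }
    std::printf("dst[0] = %f (expected %f)\n", dst[0], 2.0f * ncols_x);
}
```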
```diff
@@ -5662,10 +5664,10 @@ void ggml_init_cublas() {
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
         fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
-        for (
+        for (int id = 0; id < g_device_count; ++id) {
             cudaDeviceProp prop;
             CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-            fprintf(stderr, " Device %
+            fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
 
             g_tensor_split[id] = total_vram;
             total_vram += prop.totalGlobalMem;
```
```diff
@@ -5675,15 +5677,15 @@ void ggml_init_cublas() {
             g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
         }
-        for (
+        for (int id = 0; id < g_device_count; ++id) {
             g_tensor_split[id] /= total_vram;
         }
 
-        for (
+        for (int id = 0; id < g_device_count; ++id) {
             CUDA_CHECK(ggml_cuda_set_device(id));
 
             // create cuda streams
-            for (
+            for (int is = 0; is < MAX_STREAMS; ++is) {
                 CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
             }
 
```
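Only the loop headers changed in the two hunks above, but they are the loops that build `g_tensor_split`: each entry is first set to the VRAM total of the devices before it and then normalized, so it ends up as the fractional starting offset of that device's slice of a split tensor. A small CPU-only sketch with hypothetical VRAM sizes:

```cpp
#include <cstdio>
#include <vector>

// Sketch of the g_tensor_split computation in ggml_init_cublas, with made-up VRAM sizes.
// tensor_split[id] becomes the fraction of total VRAM held by devices *before* id,
// i.e. where device id's slice of a split tensor starts.
int main() {
    const std::vector<double> vram_bytes = { 24e9, 12e9, 12e9 };  // hypothetical devices

    std::vector<double> tensor_split(vram_bytes.size());
    double total_vram = 0.0;
    for (size_t id = 0; id < vram_bytes.size(); ++id) {
        tensor_split[id] = total_vram;   // prefix sum before adding this device
        total_vram += vram_bytes[id];
    }
    for (size_t id = 0; id < vram_bytes.size(); ++id) {
        tensor_split[id] /= total_vram;
    }

    for (size_t id = 0; id < tensor_split.size(); ++id) {
        std::printf("device %zu starts at fraction %.3f\n", id, tensor_split[id]);
    }
    // prints 0.000, 0.500, 0.750 for the sizes above
}
```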
```diff
@@ -6252,16 +6254,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, const cudaStream_t & stream) {
 
-    GGML_ASSERT(src0_dd_i
+    GGML_ASSERT(src0_dd_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_dd_i
-
+    GGML_ASSERT(dst_dd_i != nullptr);
 
     const int64_t ne00 = src0->ne[0];
-
     const int64_t ne10 = src1->ne[0];
 
     const int64_t ne0 = dst->ne[0];
+
     const int64_t row_diff = row_high - row_low;
 
     int id;
```
```diff
@@ -7013,7 +7014,8 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
 }
 
 static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
-    GGML_ASSERT(!
+    GGML_ASSERT(!ggml_is_transposed(src0));
+    GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
```
```diff
@@ -7023,11 +7025,11 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
 
-    const int64_t ne12 = src1->ne[2];
-
     const int64_t nb01 = src0->nb[1];
     const int64_t nb02 = src0->nb[2];
 
+    const int64_t ne12 = src1->ne[2];
+
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
```
```diff
@@ -7046,6 +7048,159 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
+static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+    GGML_ASSERT(!ggml_is_transposed(src0));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
+    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
+    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+
+    const int64_t ne1 = ggml_nelements(src1);
+    const int64_t ne  = ggml_nelements(dst);
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
+
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    void * src0_ddq = src0_extra->data_device[g_main_device];
+    half * src0_as_f16 = (half *) src0_ddq;
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    // convert src1 to fp16
+    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+    GGML_ASSERT(to_fp16_cuda != nullptr);
+
+    size_t src1_as = 0;
+    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
+    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
+
+    size_t dst_as = 0;
+    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    const half alpha_f16 = 1.0f;
+    const half beta_f16  = 0.0f;
+
+#if 0
+    // use cublasGemmEx
+    {
+        for (int i13 = 0; i13 < ne13; ++i13) {
+            for (int i12 = 0; i12 < ne12; ++i12) {
+                int i03 = i13 / r3;
+                int i02 = i12 / r2;
+
+                CUBLAS_CHECK(
+                    cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                        ne01, ne11, ne10,
+                        &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
+                                    (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
+                        &beta_f16,  (      char *)     dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
+                        CUBLAS_COMPUTE_16F,
+                        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+            }
+        }
+    }
+#else
+    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
+        // use cublasGemmStridedBatchedEx
+        CUBLAS_CHECK(
+            cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                ne01, ne11, ne10,
+                &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half),  src0->nb[2]/sizeof(half),  // strideA
+                            (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
+                &beta_f16,  (      char *)     dst_f16, CUDA_R_16F, ne01,                dst->nb[2]/sizeof(float), // strideC
+                ne12*ne13,
+                CUBLAS_COMPUTE_16F,
+                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+    } else {
+        // use cublasGemmBatchedEx
+        // TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
+        const int ne23 = ne12*ne13;
+
+        // TODO: avoid this alloc
+        void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));
+
+        for (int i13 = 0; i13 < ne13; ++i13) {
+            for (int i12 = 0; i12 < ne12; ++i12) {
+                int i03 = i13 / r3;
+                int i02 = i12 / r2;
+
+                ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3];
+                ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
+                ptrs[2*ne23 + i12 + i13*ne12] = (char *)     dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
+            }
+        }
+
+        // allocate device memory for pointers
+        void ** ptrs_as = nullptr;
+        CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));
+
+        // TODO: this does not work for some reason -- not sure why?
+        //size_t ptrs_s = 0;
+        //ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+
+        // copy pointers to device
+        CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice));
+
+        free(ptrs);
+
+        CUBLAS_CHECK(
+            cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                ne01, ne11, ne10,
+                &alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                            (const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+                &beta_f16,  (      void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
+                ne23,
+                CUBLAS_COMPUTE_16F,
+                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+        // free device memory for pointers
+        CUDA_CHECK(cudaFree(ptrs_as));
+        //ggml_cuda_pool_free(ptrs_as, ptrs_s);
+    }
+#endif
+
+    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+
+    ggml_cuda_pool_free(src1_as_f16, src1_as);
+    ggml_cuda_pool_free(dst_f16, dst_as);
+}
+
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
         src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
```
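The new `ggml_cuda_mul_mat_mat_batched_cublas` converts src1 to fp16 and then runs all of the per-(i12, i13) matrix products in a single call: `cublasGemmStridedBatchedEx` when there is no broadcast and the batch dimensions are contiguous, otherwise `cublasGemmBatchedEx` with an explicit table of per-GEMM pointers. The sketch below mirrors how that pointer table is laid out, using plain indices instead of device pointers so it runs without a GPU; the sizes are made up:

```cpp
#include <cstdio>
#include <vector>

// Host-only sketch of the pointer table built for cublasGemmBatchedEx above.
// r2/r3 are the broadcast factors ne12/ne02 and ne13/ne03.
int main() {
    const int ne02 = 2, ne03 = 1;        // src0 batch dims
    const int ne12 = 4, ne13 = 1;        // src1 batch dims (broadcast over src0)
    const int r2 = ne12/ne02;            // = 2
    const int r3 = ne13/ne03;            // = 1
    const int ne23 = ne12*ne13;          // number of GEMMs in the batch

    // One entry per GEMM: which src0 matrix, which src1 matrix, which dst matrix.
    std::vector<int> src0_idx(ne23), src1_idx(ne23), dst_idx(ne23);

    for (int i13 = 0; i13 < ne13; ++i13) {
        for (int i12 = 0; i12 < ne12; ++i12) {
            const int i03 = i13 / r3;    // src0 batch index being broadcast
            const int i02 = i12 / r2;
            src0_idx[i12 + i13*ne12] = i02 + i03*ne02;  // real code: src0_as_f16 + i02*nb[2] + i03*nb[3]
            src1_idx[i12 + i13*ne12] = i12 + i13*ne12;  // real code: src1_as_f16 + i12*nb[2]/2 + i13*nb[3]/2
            dst_idx [i12 + i13*ne12] = i12 + i13*ne12;
        }
    }

    for (int b = 0; b < ne23; ++b) {
        std::printf("gemm %d: dst[%d] = src0[%d]^T * src1[%d]\n", b, dst_idx[b], src0_idx[b], src1_idx[b]);
    }
    // With r2 == 2, src0 matrix 0 is reused for src1 matrices 0 and 1, and matrix 1 for 2 and 3 --
    // exactly the broadcast that the i02 = i12 / r2 division implements.
}
```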
```diff
@@ -7058,10 +7213,23 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }
 
+    // debug helpers
+    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
+    //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
+    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
+    //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
+    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
+    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
+
     if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+        // KQ single-batch
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && !ggml_is_contiguous(src0) &&
+    } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+        // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
+    } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        // KQ + KQV multi-batch
+        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
```
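The dispatch in `ggml_cuda_mul_mat` now recognizes three specialized paths (permuted KQ, non-contiguous KQV, and the new multi-batch cuBLAS route) before falling back to the generic per-row path. A simplified sketch of just that decision logic; the struct and flags below are stand-ins for `ggml_tensor` and the `ggml_is_*` helpers, not the real API:

```cpp
#include <cstdio>

// Mock of the mul_mat dispatch added above; this struct and its booleans are simplified
// stand-ins for ggml_tensor, ggml_is_permuted, ggml_is_transposed and ggml_is_contiguous.
struct MockTensor {
    int  type_f16;      // 1 if GGML_TYPE_F16, 0 if GGML_TYPE_F32
    long ne[4];
    bool permuted;
    bool transposed;
    bool contiguous;
    bool on_gpu;
};

const char * choose_mul_mat_path(const MockTensor & src0, const MockTensor & src1) {
    const bool all_on_device = src0.on_gpu && src1.on_gpu;

    if (all_on_device && src0.type_f16 && src0.permuted && src1.permuted && src1.ne[1] == 1) {
        return "mul_mat_vec_p021 (KQ single-batch)";
    } else if (all_on_device && src0.type_f16 && !src0.contiguous && !src1.transposed && src1.ne[1] == 1) {
        return "mul_mat_vec_nc (KQV single-batch)";
    } else if (all_on_device && src0.type_f16 && !src1.type_f16 &&
               !src0.transposed && !src1.transposed && src1.ne[2]*src1.ne[3] > 1) {
        return "mul_mat_mat_batched_cublas (KQ + KQV multi-batch)";
    }
    return "generic ggml_cuda_op_mul_mat path";
}

int main() {
    MockTensor k = {1, {128, 32, 32, 1}, false, false, true, true};  // hypothetical F16 K tensor
    MockTensor q = {0, {128, 8,  32, 1}, false, false, true, true};  // hypothetical F32 Q tensor, batch > 1
    std::printf("%s\n", choose_mul_mat_path(k, q));                  // selects the multi-batch path
}
```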
data/ext/llama_cpp/src/ggml-metal.m

```diff
@@ -62,6 +62,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(mul);
     GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
     GGML_METAL_DECL_KERNEL(scale);
+    GGML_METAL_DECL_KERNEL(scale_4);
     GGML_METAL_DECL_KERNEL(silu);
     GGML_METAL_DECL_KERNEL(relu);
     GGML_METAL_DECL_KERNEL(gelu);
```
```diff
@@ -249,6 +250,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(mul);
         GGML_METAL_ADD_KERNEL(mul_row);
         GGML_METAL_ADD_KERNEL(scale);
+        GGML_METAL_ADD_KERNEL(scale_4);
         GGML_METAL_ADD_KERNEL(silu);
         GGML_METAL_ADD_KERNEL(relu);
         GGML_METAL_ADD_KERNEL(gelu);
```
```diff
@@ -347,6 +349,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(mul);
     GGML_METAL_DEL_KERNEL(mul_row);
     GGML_METAL_DEL_KERNEL(scale);
+    GGML_METAL_DEL_KERNEL(scale_4);
     GGML_METAL_DEL_KERNEL(silu);
     GGML_METAL_DEL_KERNEL(relu);
     GGML_METAL_DEL_KERNEL(gelu);
```
```diff
@@ -923,15 +926,20 @@ void ggml_metal_graph_compute(
 
                         const float scale = *(const float *) src1->data;
 
-
+                        int64_t n = ggml_nelements(dst);
+
+                        if (n % 4 == 0) {
+                            n /= 4;
+                            [encoder setComputePipelineState:ctx->pipeline_scale_4];
+                        } else {
+                            [encoder setComputePipelineState:ctx->pipeline_scale];
+                        }
+
                         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                         [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                         [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
 
-
-                        GGML_ASSERT(n % 4 == 0);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                     } break;
                 case GGML_OP_UNARY:
                     switch (ggml_get_unary_op(gf->nodes[i])) {
```
data/ext/llama_cpp/src/ggml-metal.metal

```diff
@@ -125,9 +125,17 @@ kernel void kernel_mul_row(
 }
 
 kernel void kernel_scale(
+        device const float * src0,
+        device float * dst,
+        constant float & scale,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] * scale;
+}
+
+kernel void kernel_scale_4(
         device const float4 * src0,
         device float4 * dst,
-        constant float
+        constant float & scale,
         uint tpig[[thread_position_in_grid]]) {
     dst[tpig] = src0[tpig] * scale;
 }
```
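Together with the ggml-metal.m hunk further up, this splits scaling into a scalar `kernel_scale` and a float4 `kernel_scale_4`, and picks between them at encode time instead of asserting `n % 4 == 0`. A CPU-only sketch of the same selection (the 4-wide loop stands in for the float4 kernel, which is dispatched over `n/4` work items):

```cpp
#include <cstdio>
#include <vector>

// CPU analogue of kernel_scale / kernel_scale_4 and the n % 4 dispatch in ggml-metal.m.
static void scale_scalar(float * dst, const float * src, float s, long n) {
    for (long i = 0; i < n; ++i) dst[i] = src[i] * s;
}

static void scale_4wide(float * dst, const float * src, float s, long n4) {
    for (long i = 0; i < n4; ++i) {            // each "thread" covers 4 consecutive floats
        for (int k = 0; k < 4; ++k) dst[4*i + k] = src[4*i + k] * s;
    }
}

static void scale_dispatch(float * dst, const float * src, float s, long n) {
    if (n % 4 == 0) {
        scale_4wide(dst, src, s, n / 4);       // pipeline_scale_4, n/4 threadgroups
    } else {
        scale_scalar(dst, src, s, n);          // pipeline_scale, n threadgroups
    }
}

int main() {
    std::vector<float> src = {1, 2, 3, 4, 5, 6, 7};   // 7 elements: forces the scalar path
    std::vector<float> dst(src.size());
    scale_dispatch(dst.data(), src.data(), 0.5f, (long) src.size());
    std::printf("dst[6] = %.1f\n", dst[6]);           // 3.5
}
```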