llama_cpp 0.7.1 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +122 -183
- data/ext/llama_cpp/src/ggml-cuda.cu +188 -20
- data/ext/llama_cpp/src/ggml-metal.m +57 -8
- data/ext/llama_cpp/src/ggml-metal.metal +171 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +188 -222
- data/ext/llama_cpp/src/ggml.c +375 -93
- data/ext/llama_cpp/src/ggml.h +11 -9
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/llama.cpp +459 -153
- data/ext/llama_cpp/src/llama.h +34 -33
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +15 -16
- metadata +3 -3
@@ -29,6 +29,8 @@
|
|
29
29
|
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
30
30
|
#define cublasCreate hipblasCreate
|
31
31
|
#define cublasGemmEx hipblasGemmEx
|
32
|
+
#define cublasGemmBatchedEx hipblasGemmBatchedEx
|
33
|
+
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
|
32
34
|
#define cublasHandle_t hipblasHandle_t
|
33
35
|
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
34
36
|
#define cublasSetStream hipblasSetStream
|
@@ -4326,13 +4328,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
|
4326
4328
|
|
4327
4329
|
const half * x = (const half *) vx;
|
4328
4330
|
|
4329
|
-
const int row_x
|
4330
|
-
const int channel
|
4331
|
+
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
4332
|
+
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
4331
4333
|
const int channel_x = channel / channel_x_divisor;
|
4332
4334
|
|
4333
|
-
const int nrows_y
|
4335
|
+
const int nrows_y = ncols_x;
|
4334
4336
|
const int nrows_dst = nrows_x;
|
4335
|
-
const int row_dst
|
4337
|
+
const int row_dst = row_x;
|
4336
4338
|
|
4337
4339
|
const int idst = channel*nrows_dst + row_dst;
|
4338
4340
|
|
@@ -4345,13 +4347,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
|
4345
4347
|
break;
|
4346
4348
|
}
|
4347
4349
|
|
4348
|
-
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
|
4349
|
-
const float xi = __half2float(x[ix]);
|
4350
|
-
|
4351
4350
|
const int row_y = col_x;
|
4352
4351
|
|
4352
|
+
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
|
4353
4353
|
const int iy = channel*nrows_y + row_y;
|
4354
4354
|
|
4355
|
+
const float xi = __half2float(x[ix]);
|
4356
|
+
|
4355
4357
|
tmp += xi * y[iy];
|
4356
4358
|
}
|
4357
4359
|
|
@@ -5662,10 +5664,10 @@ void ggml_init_cublas() {
|
|
5662
5664
|
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
|
5663
5665
|
int64_t total_vram = 0;
|
5664
5666
|
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
|
5665
|
-
for (
|
5667
|
+
for (int id = 0; id < g_device_count; ++id) {
|
5666
5668
|
cudaDeviceProp prop;
|
5667
5669
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
5668
|
-
fprintf(stderr, " Device %
|
5670
|
+
fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
|
5669
5671
|
|
5670
5672
|
g_tensor_split[id] = total_vram;
|
5671
5673
|
total_vram += prop.totalGlobalMem;
|
@@ -5675,15 +5677,15 @@ void ggml_init_cublas() {
|
|
5675
5677
|
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
|
5676
5678
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
5677
5679
|
}
|
5678
|
-
for (
|
5680
|
+
for (int id = 0; id < g_device_count; ++id) {
|
5679
5681
|
g_tensor_split[id] /= total_vram;
|
5680
5682
|
}
|
5681
5683
|
|
5682
|
-
for (
|
5684
|
+
for (int id = 0; id < g_device_count; ++id) {
|
5683
5685
|
CUDA_CHECK(ggml_cuda_set_device(id));
|
5684
5686
|
|
5685
5687
|
// create cuda streams
|
5686
|
-
for (
|
5688
|
+
for (int is = 0; is < MAX_STREAMS; ++is) {
|
5687
5689
|
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
|
5688
5690
|
}
|
5689
5691
|
|
@@ -6252,16 +6254,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|
6252
6254
|
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
6253
6255
|
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
6254
6256
|
|
6255
|
-
GGML_ASSERT(src0_dd_i
|
6257
|
+
GGML_ASSERT(src0_dd_i != nullptr);
|
6256
6258
|
GGML_ASSERT(src1_ddf_i != nullptr);
|
6257
|
-
GGML_ASSERT(dst_dd_i
|
6258
|
-
|
6259
|
+
GGML_ASSERT(dst_dd_i != nullptr);
|
6259
6260
|
|
6260
6261
|
const int64_t ne00 = src0->ne[0];
|
6261
|
-
|
6262
6262
|
const int64_t ne10 = src1->ne[0];
|
6263
6263
|
|
6264
6264
|
const int64_t ne0 = dst->ne[0];
|
6265
|
+
|
6265
6266
|
const int64_t row_diff = row_high - row_low;
|
6266
6267
|
|
6267
6268
|
int id;
|
@@ -7013,7 +7014,8 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
|
|
7013
7014
|
}
|
7014
7015
|
|
7015
7016
|
static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
7016
|
-
GGML_ASSERT(!
|
7017
|
+
GGML_ASSERT(!ggml_is_transposed(src0));
|
7018
|
+
GGML_ASSERT(!ggml_is_transposed(src1));
|
7017
7019
|
GGML_ASSERT(!ggml_is_permuted(src0));
|
7018
7020
|
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
7019
7021
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
@@ -7023,11 +7025,11 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
|
7023
7025
|
const int64_t ne01 = src0->ne[1];
|
7024
7026
|
const int64_t ne02 = src0->ne[2];
|
7025
7027
|
|
7026
|
-
const int64_t ne12 = src1->ne[2];
|
7027
|
-
|
7028
7028
|
const int64_t nb01 = src0->nb[1];
|
7029
7029
|
const int64_t nb02 = src0->nb[2];
|
7030
7030
|
|
7031
|
+
const int64_t ne12 = src1->ne[2];
|
7032
|
+
|
7031
7033
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
7032
7034
|
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
7033
7035
|
|
@@ -7046,6 +7048,159 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
|
7046
7048
|
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
|
7047
7049
|
}
|
7048
7050
|
|
7051
|
+
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
7052
|
+
GGML_ASSERT(!ggml_is_transposed(src0));
|
7053
|
+
GGML_ASSERT(!ggml_is_transposed(src1));
|
7054
|
+
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
7055
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
7056
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
7057
|
+
|
7058
|
+
const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
|
7059
|
+
const int64_t ne01 = src0->ne[1];
|
7060
|
+
const int64_t ne02 = src0->ne[2];
|
7061
|
+
const int64_t ne03 = src0->ne[3];
|
7062
|
+
|
7063
|
+
const int64_t nb01 = src0->nb[1];
|
7064
|
+
const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
|
7065
|
+
const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
|
7066
|
+
|
7067
|
+
const int64_t ne10 = src1->ne[0];
|
7068
|
+
const int64_t ne11 = src1->ne[1];
|
7069
|
+
const int64_t ne12 = src1->ne[2];
|
7070
|
+
const int64_t ne13 = src1->ne[3];
|
7071
|
+
|
7072
|
+
const int64_t nb11 = src1->nb[1];
|
7073
|
+
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
|
7074
|
+
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
|
7075
|
+
|
7076
|
+
const int64_t ne1 = ggml_nelements(src1);
|
7077
|
+
const int64_t ne = ggml_nelements(dst);
|
7078
|
+
|
7079
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
7080
|
+
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
7081
|
+
|
7082
|
+
int id;
|
7083
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
7084
|
+
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
|
7085
|
+
|
7086
|
+
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
7087
|
+
void * src0_ddq = src0_extra->data_device[g_main_device];
|
7088
|
+
half * src0_as_f16 = (half *) src0_ddq;
|
7089
|
+
|
7090
|
+
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
7091
|
+
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
7092
|
+
|
7093
|
+
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
7094
|
+
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
7095
|
+
|
7096
|
+
// convert src1 to fp16
|
7097
|
+
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
7098
|
+
GGML_ASSERT(to_fp16_cuda != nullptr);
|
7099
|
+
|
7100
|
+
size_t src1_as = 0;
|
7101
|
+
half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
|
7102
|
+
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
|
7103
|
+
|
7104
|
+
size_t dst_as = 0;
|
7105
|
+
half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
|
7106
|
+
|
7107
|
+
GGML_ASSERT(ne12 % ne02 == 0);
|
7108
|
+
GGML_ASSERT(ne13 % ne03 == 0);
|
7109
|
+
|
7110
|
+
// broadcast factors
|
7111
|
+
const int64_t r2 = ne12/ne02;
|
7112
|
+
const int64_t r3 = ne13/ne03;
|
7113
|
+
|
7114
|
+
const half alpha_f16 = 1.0f;
|
7115
|
+
const half beta_f16 = 0.0f;
|
7116
|
+
|
7117
|
+
#if 0
|
7118
|
+
// use cublasGemmEx
|
7119
|
+
{
|
7120
|
+
for (int i13 = 0; i13 < ne13; ++i13) {
|
7121
|
+
for (int i12 = 0; i12 < ne12; ++i12) {
|
7122
|
+
int i03 = i13 / r3;
|
7123
|
+
int i02 = i12 / r2;
|
7124
|
+
|
7125
|
+
CUBLAS_CHECK(
|
7126
|
+
cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
7127
|
+
ne01, ne11, ne10,
|
7128
|
+
&alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
|
7129
|
+
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
|
7130
|
+
&beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
|
7131
|
+
CUBLAS_COMPUTE_16F,
|
7132
|
+
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
7133
|
+
}
|
7134
|
+
}
|
7135
|
+
}
|
7136
|
+
#else
|
7137
|
+
if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
|
7138
|
+
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
7139
|
+
// use cublasGemmStridedBatchedEx
|
7140
|
+
CUBLAS_CHECK(
|
7141
|
+
cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
7142
|
+
ne01, ne11, ne10,
|
7143
|
+
&alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
|
7144
|
+
(const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
|
7145
|
+
&beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC
|
7146
|
+
ne12*ne13,
|
7147
|
+
CUBLAS_COMPUTE_16F,
|
7148
|
+
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
7149
|
+
} else {
|
7150
|
+
// use cublasGemmBatchedEx
|
7151
|
+
// TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
|
7152
|
+
const int ne23 = ne12*ne13;
|
7153
|
+
|
7154
|
+
// TODO: avoid this alloc
|
7155
|
+
void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));
|
7156
|
+
|
7157
|
+
for (int i13 = 0; i13 < ne13; ++i13) {
|
7158
|
+
for (int i12 = 0; i12 < ne12; ++i12) {
|
7159
|
+
int i03 = i13 / r3;
|
7160
|
+
int i02 = i12 / r2;
|
7161
|
+
|
7162
|
+
ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3];
|
7163
|
+
ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
|
7164
|
+
ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
|
7165
|
+
}
|
7166
|
+
}
|
7167
|
+
|
7168
|
+
// allocate device memory for pointers
|
7169
|
+
void ** ptrs_as = nullptr;
|
7170
|
+
CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));
|
7171
|
+
|
7172
|
+
// TODO: this does not work for some reason -- not sure why?
|
7173
|
+
//size_t ptrs_s = 0;
|
7174
|
+
//ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
|
7175
|
+
|
7176
|
+
// copy pointers to device
|
7177
|
+
CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice));
|
7178
|
+
|
7179
|
+
free(ptrs);
|
7180
|
+
|
7181
|
+
CUBLAS_CHECK(
|
7182
|
+
cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
7183
|
+
ne01, ne11, ne10,
|
7184
|
+
&alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
|
7185
|
+
(const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
|
7186
|
+
&beta_f16, ( void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
|
7187
|
+
ne23,
|
7188
|
+
CUBLAS_COMPUTE_16F,
|
7189
|
+
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
7190
|
+
|
7191
|
+
// free device memory for pointers
|
7192
|
+
CUDA_CHECK(cudaFree(ptrs_as));
|
7193
|
+
//ggml_cuda_pool_free(ptrs_as, ptrs_s);
|
7194
|
+
}
|
7195
|
+
#endif
|
7196
|
+
|
7197
|
+
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
7198
|
+
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
|
7199
|
+
|
7200
|
+
ggml_cuda_pool_free(src1_as_f16, src1_as);
|
7201
|
+
ggml_cuda_pool_free(dst_f16, dst_as);
|
7202
|
+
}
|
7203
|
+
|
7049
7204
|
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7050
7205
|
bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
|
7051
7206
|
src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
|
@@ -7058,10 +7213,23 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
7058
7213
|
}
|
7059
7214
|
}
|
7060
7215
|
|
7216
|
+
// debug helpers
|
7217
|
+
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
|
7218
|
+
//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
|
7219
|
+
//printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
|
7220
|
+
//printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
|
7221
|
+
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
7222
|
+
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
7223
|
+
|
7061
7224
|
if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
7225
|
+
// KQ single-batch
|
7062
7226
|
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
7063
|
-
} else if (all_on_device && !ggml_is_contiguous(src0) &&
|
7227
|
+
} else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
7228
|
+
// KQV single-batch
|
7064
7229
|
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
7230
|
+
} else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
7231
|
+
// KQ + KQV multi-batch
|
7232
|
+
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
7065
7233
|
} else if (src0->type == GGML_TYPE_F32) {
|
7066
7234
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
7067
7235
|
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
@@ -62,6 +62,7 @@ struct ggml_metal_context {
|
|
62
62
|
GGML_METAL_DECL_KERNEL(mul);
|
63
63
|
GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
|
64
64
|
GGML_METAL_DECL_KERNEL(scale);
|
65
|
+
GGML_METAL_DECL_KERNEL(scale_4);
|
65
66
|
GGML_METAL_DECL_KERNEL(silu);
|
66
67
|
GGML_METAL_DECL_KERNEL(relu);
|
67
68
|
GGML_METAL_DECL_KERNEL(gelu);
|
@@ -73,6 +74,8 @@ struct ggml_metal_context {
|
|
73
74
|
GGML_METAL_DECL_KERNEL(get_rows_f16);
|
74
75
|
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
|
75
76
|
GGML_METAL_DECL_KERNEL(get_rows_q4_1);
|
77
|
+
GGML_METAL_DECL_KERNEL(get_rows_q5_0);
|
78
|
+
GGML_METAL_DECL_KERNEL(get_rows_q5_1);
|
76
79
|
GGML_METAL_DECL_KERNEL(get_rows_q8_0);
|
77
80
|
GGML_METAL_DECL_KERNEL(get_rows_q2_K);
|
78
81
|
GGML_METAL_DECL_KERNEL(get_rows_q3_K);
|
@@ -87,6 +90,8 @@ struct ggml_metal_context {
|
|
87
90
|
GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4);
|
88
91
|
GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32);
|
89
92
|
GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32);
|
93
|
+
GGML_METAL_DECL_KERNEL(mul_mv_q5_0_f32);
|
94
|
+
GGML_METAL_DECL_KERNEL(mul_mv_q5_1_f32);
|
90
95
|
GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32);
|
91
96
|
GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32);
|
92
97
|
GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32);
|
@@ -97,6 +102,8 @@ struct ggml_metal_context {
|
|
97
102
|
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
|
98
103
|
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
|
99
104
|
GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
|
105
|
+
GGML_METAL_DECL_KERNEL(mul_mm_q5_0_f32);
|
106
|
+
GGML_METAL_DECL_KERNEL(mul_mm_q5_1_f32);
|
100
107
|
GGML_METAL_DECL_KERNEL(mul_mm_q8_0_f32);
|
101
108
|
GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32);
|
102
109
|
GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32);
|
@@ -243,6 +250,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
243
250
|
GGML_METAL_ADD_KERNEL(mul);
|
244
251
|
GGML_METAL_ADD_KERNEL(mul_row);
|
245
252
|
GGML_METAL_ADD_KERNEL(scale);
|
253
|
+
GGML_METAL_ADD_KERNEL(scale_4);
|
246
254
|
GGML_METAL_ADD_KERNEL(silu);
|
247
255
|
GGML_METAL_ADD_KERNEL(relu);
|
248
256
|
GGML_METAL_ADD_KERNEL(gelu);
|
@@ -254,6 +262,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
254
262
|
GGML_METAL_ADD_KERNEL(get_rows_f16);
|
255
263
|
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
|
256
264
|
GGML_METAL_ADD_KERNEL(get_rows_q4_1);
|
265
|
+
GGML_METAL_ADD_KERNEL(get_rows_q5_0);
|
266
|
+
GGML_METAL_ADD_KERNEL(get_rows_q5_1);
|
257
267
|
GGML_METAL_ADD_KERNEL(get_rows_q8_0);
|
258
268
|
GGML_METAL_ADD_KERNEL(get_rows_q2_K);
|
259
269
|
GGML_METAL_ADD_KERNEL(get_rows_q3_K);
|
@@ -268,6 +278,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
268
278
|
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4);
|
269
279
|
GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32);
|
270
280
|
GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32);
|
281
|
+
GGML_METAL_ADD_KERNEL(mul_mv_q5_0_f32);
|
282
|
+
GGML_METAL_ADD_KERNEL(mul_mv_q5_1_f32);
|
271
283
|
GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32);
|
272
284
|
GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32);
|
273
285
|
GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32);
|
@@ -278,8 +290,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
278
290
|
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
|
279
291
|
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
280
292
|
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
|
281
|
-
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
|
282
293
|
GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
|
294
|
+
GGML_METAL_ADD_KERNEL(mul_mm_q5_0_f32);
|
295
|
+
GGML_METAL_ADD_KERNEL(mul_mm_q5_1_f32);
|
296
|
+
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
|
283
297
|
GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
|
284
298
|
GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
|
285
299
|
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
|
@@ -335,6 +349,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
335
349
|
GGML_METAL_DEL_KERNEL(mul);
|
336
350
|
GGML_METAL_DEL_KERNEL(mul_row);
|
337
351
|
GGML_METAL_DEL_KERNEL(scale);
|
352
|
+
GGML_METAL_DEL_KERNEL(scale_4);
|
338
353
|
GGML_METAL_DEL_KERNEL(silu);
|
339
354
|
GGML_METAL_DEL_KERNEL(relu);
|
340
355
|
GGML_METAL_DEL_KERNEL(gelu);
|
@@ -346,6 +361,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
346
361
|
GGML_METAL_DEL_KERNEL(get_rows_f16);
|
347
362
|
GGML_METAL_DEL_KERNEL(get_rows_q4_0);
|
348
363
|
GGML_METAL_DEL_KERNEL(get_rows_q4_1);
|
364
|
+
GGML_METAL_DEL_KERNEL(get_rows_q5_0);
|
365
|
+
GGML_METAL_DEL_KERNEL(get_rows_q5_1);
|
349
366
|
GGML_METAL_DEL_KERNEL(get_rows_q8_0);
|
350
367
|
GGML_METAL_DEL_KERNEL(get_rows_q2_K);
|
351
368
|
GGML_METAL_DEL_KERNEL(get_rows_q3_K);
|
@@ -360,6 +377,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
360
377
|
GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4);
|
361
378
|
GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32);
|
362
379
|
GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32);
|
380
|
+
GGML_METAL_DEL_KERNEL(mul_mv_q5_0_f32);
|
381
|
+
GGML_METAL_DEL_KERNEL(mul_mv_q5_1_f32);
|
363
382
|
GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32);
|
364
383
|
GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32);
|
365
384
|
GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32);
|
@@ -370,8 +389,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
370
389
|
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
|
371
390
|
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
372
391
|
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
|
373
|
-
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
|
374
392
|
GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
|
393
|
+
GGML_METAL_DEL_KERNEL(mul_mm_q5_0_f32);
|
394
|
+
GGML_METAL_DEL_KERNEL(mul_mm_q5_1_f32);
|
395
|
+
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
|
375
396
|
GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
|
376
397
|
GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
|
377
398
|
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
|
@@ -905,15 +926,20 @@ void ggml_metal_graph_compute(
|
|
905
926
|
|
906
927
|
const float scale = *(const float *) src1->data;
|
907
928
|
|
908
|
-
|
929
|
+
int64_t n = ggml_nelements(dst);
|
930
|
+
|
931
|
+
if (n % 4 == 0) {
|
932
|
+
n /= 4;
|
933
|
+
[encoder setComputePipelineState:ctx->pipeline_scale_4];
|
934
|
+
} else {
|
935
|
+
[encoder setComputePipelineState:ctx->pipeline_scale];
|
936
|
+
}
|
937
|
+
|
909
938
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
910
939
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
911
940
|
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
|
912
941
|
|
913
|
-
|
914
|
-
GGML_ASSERT(n % 4 == 0);
|
915
|
-
|
916
|
-
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
942
|
+
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
917
943
|
} break;
|
918
944
|
case GGML_OP_UNARY:
|
919
945
|
switch (ggml_get_unary_op(gf->nodes[i])) {
|
@@ -1052,6 +1078,8 @@ void ggml_metal_graph_compute(
|
|
1052
1078
|
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
|
1053
1079
|
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
|
1054
1080
|
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
|
1081
|
+
case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_0_f32]; break;
|
1082
|
+
case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_1_f32]; break;
|
1055
1083
|
case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q8_0_f32]; break;
|
1056
1084
|
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
|
1057
1085
|
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
|
@@ -1121,6 +1149,24 @@ void ggml_metal_graph_compute(
|
|
1121
1149
|
nth1 = 8;
|
1122
1150
|
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32];
|
1123
1151
|
} break;
|
1152
|
+
case GGML_TYPE_Q5_0:
|
1153
|
+
{
|
1154
|
+
GGML_ASSERT(ne02 == 1);
|
1155
|
+
GGML_ASSERT(ne12 == 1);
|
1156
|
+
|
1157
|
+
nth0 = 8;
|
1158
|
+
nth1 = 8;
|
1159
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_0_f32];
|
1160
|
+
} break;
|
1161
|
+
case GGML_TYPE_Q5_1:
|
1162
|
+
{
|
1163
|
+
GGML_ASSERT(ne02 == 1);
|
1164
|
+
GGML_ASSERT(ne12 == 1);
|
1165
|
+
|
1166
|
+
nth0 = 8;
|
1167
|
+
nth1 = 8;
|
1168
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_1_f32];
|
1169
|
+
} break;
|
1124
1170
|
case GGML_TYPE_Q8_0:
|
1125
1171
|
{
|
1126
1172
|
GGML_ASSERT(ne02 == 1);
|
@@ -1201,7 +1247,8 @@ void ggml_metal_graph_compute(
|
|
1201
1247
|
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
|
1202
1248
|
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
|
1203
1249
|
|
1204
|
-
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
|
1250
|
+
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
|
1251
|
+
src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
|
1205
1252
|
src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
|
1206
1253
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1207
1254
|
}
|
@@ -1233,6 +1280,8 @@ void ggml_metal_graph_compute(
|
|
1233
1280
|
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
|
1234
1281
|
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
|
1235
1282
|
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
|
1283
|
+
case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_0]; break;
|
1284
|
+
case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_1]; break;
|
1236
1285
|
case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q8_0]; break;
|
1237
1286
|
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
|
1238
1287
|
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
|