llama_cpp 0.7.1 → 0.9.0

ggml-cuda.cu

@@ -29,6 +29,8 @@
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasCreate hipblasCreate
 #define cublasGemmEx hipblasGemmEx
+#define cublasGemmBatchedEx hipblasGemmBatchedEx
+#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
 #define cublasHandle_t hipblasHandle_t
 #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
 #define cublasSetStream hipblasSetStream
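
The two added aliases extend the existing cuBLAS-to-hipBLAS shim so that the batched GEMM calls introduced later in this diff compile unchanged on a ROCm build. A stand-alone sketch of the mechanism (the GGML_USE_HIPBLAS define and the stringify helpers below are illustrative, not taken from the diff):

    /* C sketch: with the alias in place, a single call site serves both backends. */
    #include <stdio.h>

    #define GGML_USE_HIPBLAS                      /* pretend this is a ROCm build */

    #if defined(GGML_USE_HIPBLAS)
    #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
    #endif

    #define STR_(x) #x
    #define STR(x)  STR_(x)

    int main(void) {
        /* the cuBLAS token is rewritten by the preprocessor before compilation */
        printf("call compiles as: %s\n", STR(cublasGemmStridedBatchedEx));  /* prints the hipBLAS name */
        return 0;
    }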
@@ -4326,13 +4328,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
 
     const half * x = (const half *) vx;
 
-    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
-    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+    const int row_x     = blockDim.y*blockIdx.y + threadIdx.y;
+    const int channel   = blockDim.z*blockIdx.z + threadIdx.z;
     const int channel_x = channel / channel_x_divisor;
 
-    const int nrows_y = ncols_x;
+    const int nrows_y   = ncols_x;
     const int nrows_dst = nrows_x;
-    const int row_dst = row_x;
+    const int row_dst   = row_x;
 
     const int idst = channel*nrows_dst + row_dst;
 
@@ -4345,13 +4347,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
             break;
         }
 
-        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
-        const float xi = __half2float(x[ix]);
-
         const int row_y = col_x;
 
+        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
         const int iy = channel*nrows_y + row_y;
 
+        const float xi = __half2float(x[ix]);
+
         tmp += xi * y[iy];
     }
 
@@ -5662,10 +5664,10 @@ void ggml_init_cublas() {
     GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
     int64_t total_vram = 0;
     fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
-    for (int64_t id = 0; id < g_device_count; ++id) {
+    for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+        fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
 
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
@@ -5675,15 +5677,15 @@ void ggml_init_cublas() {
         g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     }
-    for (int64_t id = 0; id < g_device_count; ++id) {
+    for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
     }
 
-    for (int64_t id = 0; id < g_device_count; ++id) {
+    for (int id = 0; id < g_device_count; ++id) {
         CUDA_CHECK(ggml_cuda_set_device(id));
 
         // create cuda streams
-        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+        for (int is = 0; is < MAX_STREAMS; ++is) {
             CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
         }
 
@@ -6252,16 +6254,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, const cudaStream_t & stream) {
 
-    GGML_ASSERT(src0_dd_i != nullptr);
+    GGML_ASSERT(src0_dd_i  != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_dd_i != nullptr);
-
+    GGML_ASSERT(dst_dd_i   != nullptr);
 
     const int64_t ne00 = src0->ne[0];
-
     const int64_t ne10 = src1->ne[0];
 
     const int64_t ne0 = dst->ne[0];
+
     const int64_t row_diff = row_high - row_low;
 
     int id;
@@ -7013,7 +7014,8 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
 }
 
 static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
-    GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
+    GGML_ASSERT(!ggml_is_transposed(src0));
+    GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
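
The precondition here changes from a contiguity check on src0/src1 to transposition checks. In ggml a tensor counts as transposed when its first stride exceeds its second (roughly nb[0] > nb[1]); a minimal host-side restatement of that predicate, using a simplified struct rather than the ggml API:

    /* Assumption: mirrors the spirit of ggml_is_transposed (compare the first two strides). */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct toy_tensor {
        int64_t ne[4];   /* elements per dimension  */
        size_t  nb[4];   /* stride in bytes per dim */
    };

    static bool toy_is_transposed(const struct toy_tensor * t) {
        return t->nb[0] > t->nb[1];
    }

    int main(void) {
        struct toy_tensor a = { {4, 3, 1, 1}, { 4, 16, 48, 48} };  /* row-major 4x3 floats       */
        struct toy_tensor b = { {3, 4, 1, 1}, {16,  4, 48, 48} };  /* same data, swapped strides */
        printf("a transposed: %d, b transposed: %d\n", toy_is_transposed(&a), toy_is_transposed(&b));
        return 0;
    }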
@@ -7023,11 +7025,11 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
 
-    const int64_t ne12 = src1->ne[2];
-
     const int64_t nb01 = src0->nb[1];
     const int64_t nb02 = src0->nb[2];
 
+    const int64_t ne12 = src1->ne[2];
+
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
@@ -7046,6 +7048,159 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
+static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+    GGML_ASSERT(!ggml_is_transposed(src0));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
+    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
+    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+
+    const int64_t ne1 = ggml_nelements(src1);
+    const int64_t ne  = ggml_nelements(dst);
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
+
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    void * src0_ddq = src0_extra->data_device[g_main_device];
+    half * src0_as_f16 = (half *) src0_ddq;
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    // convert src1 to fp16
+    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+    GGML_ASSERT(to_fp16_cuda != nullptr);
+
+    size_t src1_as = 0;
+    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
+    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
+
+    size_t dst_as = 0;
+    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    const half alpha_f16 = 1.0f;
+    const half beta_f16  = 0.0f;
+
+#if 0
+    // use cublasGemmEx
+    {
+        for (int i13 = 0; i13 < ne13; ++i13) {
+            for (int i12 = 0; i12 < ne12; ++i12) {
+                int i03 = i13 / r3;
+                int i02 = i12 / r2;
+
+                CUBLAS_CHECK(
+                        cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                            ne01, ne11, ne10,
+                            &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3], CUDA_R_16F, nb01/sizeof(half),
+                                        (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
+                            &beta_f16,  ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
+                            CUBLAS_COMPUTE_16F,
+                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+            }
+        }
+    }
+#else
+    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
+        // use cublasGemmStridedBatchedEx
+        CUBLAS_CHECK(
+        cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                ne01, ne11, ne10,
+                &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half),  src0->nb[2]/sizeof(half),  // strideA
+                            (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
+                &beta_f16,  ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC
+                ne12*ne13,
+                CUBLAS_COMPUTE_16F,
+                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+    } else {
+        // use cublasGemmBatchedEx
+        // TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
+        const int ne23 = ne12*ne13;
+
+        // TODO: avoid this alloc
+        void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));
+
+        for (int i13 = 0; i13 < ne13; ++i13) {
+            for (int i12 = 0; i12 < ne12; ++i12) {
+                int i03 = i13 / r3;
+                int i02 = i12 / r2;
+
+                ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3];
+                ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
+                ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
+            }
+        }
+
+        // allocate device memory for pointers
+        void ** ptrs_as = nullptr;
+        CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));
+
+        // TODO: this does not work for some reason -- not sure why?
+        //size_t ptrs_s = 0;
+        //ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+
+        // copy pointers to device
+        CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice));
+
+        free(ptrs);
+
+        CUBLAS_CHECK(
+        cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                ne01, ne11, ne10,
+                &alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                            (const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+                &beta_f16,  ( void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
+                ne23,
+                CUBLAS_COMPUTE_16F,
+                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+        // free device memory for pointers
+        CUDA_CHECK(cudaFree(ptrs_as));
+        //ggml_cuda_pool_free(ptrs_as, ptrs_s);
+    }
+#endif
+
+    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+
+    ggml_cuda_pool_free(src1_as_f16, src1_as);
+    ggml_cuda_pool_free(dst_f16, dst_as);
+}
+
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
         src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
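
The new batched path picks between two cuBLAS entry points: a single cublasGemmStridedBatchedEx call when there is no broadcast and dims 2/3 are laid out contiguously, and cublasGemmBatchedEx over an explicit pointer table otherwise, with src0 broadcast across src1's batch dims through r2 = ne12/ne02 and r3 = ne13/ne03. A host-only sketch of that mapping and of the 3*ne23 pointer-table layout built above (toy sizes are mine; the index math is copied from the hunk):

    /* Host-only illustration: src1 batch (i12, i13) is paired with src0 batch
     * (i12/r2, i13/r3); the pointer table holds src0/src1/dst entries in three
     * consecutive sections of ne23 = ne12*ne13 slots each. */
    #include <stdio.h>

    int main(void) {
        const int ne02 = 8,  ne03 = 1;   /* src0 batch dims (e.g. 8 KV heads)     */
        const int ne12 = 32, ne13 = 1;   /* src1 batch dims (e.g. 32 query heads) */

        const int r2   = ne12/ne02;      /* broadcast factor along dim 2 (= 4) */
        const int r3   = ne13/ne03;      /* broadcast factor along dim 3 (= 1) */
        const int ne23 = ne12*ne13;      /* number of GEMMs in the batch       */

        for (int i13 = 0; i13 < ne13; ++i13) {
            for (int i12 = 0; i12 < ne12; ++i12) {
                const int i03  = i13 / r3;
                const int i02  = i12 / r2;
                const int slot = i12 + i13*ne12;
                printf("GEMM %2d: src0[%d,%d] x src1[%2d,%d] -> table slots %2d/%2d/%3d\n",
                       slot, i02, i03, i12, i13,
                       0*ne23 + slot, 1*ne23 + slot, 2*ne23 + slot);
            }
        }
        return 0;
    }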
@@ -7058,10 +7213,23 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }
 
+    // debug helpers
+    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
+    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
+    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
+    //printf("      %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
+    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
+    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
+
     if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+        // KQ single-batch
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
+    } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+        // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
+    } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        // KQ + KQV multi-batch
+        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
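
With the extra branch in place, an F16 src0 now goes down one of three specialized attention paths before the generic op: a single-row permuted kernel (KQ), a single-row non-contiguous kernel (KQV), or the new batched cuBLAS GEMM whenever src1 carries more than one matrix in dims 2/3. A deliberately simplified restatement of that ordering (plain C with a toy tensor struct; the real conditions above also check backends and the permutation/transposition helpers):

    /* Illustration only; not the actual dispatch code. */
    #include <stdint.h>
    #include <stdio.h>

    enum { TOY_F32, TOY_F16 };
    struct toy_tensor { int type; int64_t ne[4]; };

    static const char * pick_mul_mat_path(const struct toy_tensor * src0, const struct toy_tensor * src1) {
        if (src0->type == TOY_F16 && src1->ne[1] == 1) {
            return "single-batch vec kernel (KQ/KQV style)";
        }
        if (src0->type == TOY_F16 && src1->type == TOY_F32 && src1->ne[2]*src1->ne[3] > 1) {
            return "batched cuBLAS GEMM (new in this diff)";
        }
        return "generic ggml_cuda_op_mul_mat path";
    }

    int main(void) {
        struct toy_tensor k  = { TOY_F16, {128, 512, 32, 1} };  /* e.g. an F16 K cache, 32 heads */
        struct toy_tensor q1 = { TOY_F32, {128,   1, 32, 1} };  /* single-token query            */
        struct toy_tensor qN = { TOY_F32, {128,  64, 32, 1} };  /* 64-token query batch          */
        printf("decode step : %s\n", pick_mul_mat_path(&k, &q1));
        printf("prompt batch: %s\n", pick_mul_mat_path(&k, &qN));
        return 0;
    }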
ggml-metal.m

@@ -62,6 +62,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(mul);
     GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
     GGML_METAL_DECL_KERNEL(scale);
+    GGML_METAL_DECL_KERNEL(scale_4);
     GGML_METAL_DECL_KERNEL(silu);
     GGML_METAL_DECL_KERNEL(relu);
     GGML_METAL_DECL_KERNEL(gelu);
@@ -73,6 +74,8 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(get_rows_f16);
     GGML_METAL_DECL_KERNEL(get_rows_q4_0);
     GGML_METAL_DECL_KERNEL(get_rows_q4_1);
+    GGML_METAL_DECL_KERNEL(get_rows_q5_0);
+    GGML_METAL_DECL_KERNEL(get_rows_q5_1);
     GGML_METAL_DECL_KERNEL(get_rows_q8_0);
     GGML_METAL_DECL_KERNEL(get_rows_q2_K);
     GGML_METAL_DECL_KERNEL(get_rows_q3_K);
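
The q5_0 and q5_1 kernels being declared here (and registered and dispatched below) cover ggml's two 5-bit block quantization formats. For orientation, a CPU-side sketch of what one Q5_0 block looks like and how it dequantizes, under the assumption that it matches ggml's block_q5_0 layout (a half-precision scale d, 32 high bits in qh, 16 bytes of low nibbles in qs, weight = d * (q - 16); Q5_1 additionally stores a min m and uses d * q + m). The Metal shaders in ggml-metal.metal remain the authoritative reference:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define QK5_0 32  /* elements per block (assumed, as in ggml) */

    /* dequantize one Q5_0-style block; d is taken as an already-converted float */
    static void dequant_q5_0_block(float d, const uint8_t qh[4], const uint8_t qs[QK5_0/2], float out[QK5_0]) {
        uint32_t h;
        memcpy(&h, qh, sizeof(h));
        for (int j = 0; j < QK5_0/2; ++j) {
            const uint8_t hi0 = ((h >> (j +  0)) << 4) & 0x10;   /* 5th bit of element j      */
            const uint8_t hi1 = ((h >> (j + 12))     ) & 0x10;   /* 5th bit of element j + 16 */
            out[j          ] = d * (((qs[j] & 0x0F) | hi0) - 16);
            out[j + QK5_0/2] = d * (((qs[j] >>   4) | hi1) - 16);
        }
    }

    int main(void) {
        uint8_t qh[4] = {0}, qs[QK5_0/2] = {0};
        float out[QK5_0];
        qs[0] = 0x1F;                          /* low nibble 15 -> element 0, high nibble 1 -> element 16 */
        dequant_q5_0_block(0.5f, qh, qs, out);
        printf("%.2f %.2f %.2f\n", out[0], out[16], out[1]);   /* -0.50 -7.50 -8.00 */
        return 0;
    }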
@@ -87,6 +90,8 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4);
     GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q5_0_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q5_1_f32);
     GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32);
     GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32);
@@ -97,6 +102,8 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
     GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q5_0_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q5_1_f32);
     GGML_METAL_DECL_KERNEL(mul_mm_q8_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32);
     GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32);
@@ -243,6 +250,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(mul);
         GGML_METAL_ADD_KERNEL(mul_row);
         GGML_METAL_ADD_KERNEL(scale);
+        GGML_METAL_ADD_KERNEL(scale_4);
         GGML_METAL_ADD_KERNEL(silu);
         GGML_METAL_ADD_KERNEL(relu);
         GGML_METAL_ADD_KERNEL(gelu);
@@ -254,6 +262,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(get_rows_f16);
         GGML_METAL_ADD_KERNEL(get_rows_q4_0);
         GGML_METAL_ADD_KERNEL(get_rows_q4_1);
+        GGML_METAL_ADD_KERNEL(get_rows_q5_0);
+        GGML_METAL_ADD_KERNEL(get_rows_q5_1);
         GGML_METAL_ADD_KERNEL(get_rows_q8_0);
         GGML_METAL_ADD_KERNEL(get_rows_q2_K);
         GGML_METAL_ADD_KERNEL(get_rows_q3_K);
@@ -268,6 +278,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4);
         GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32);
         GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_q5_0_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_q5_1_f32);
         GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32);
         GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32);
         GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32);
@@ -278,8 +290,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
         GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
         GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
         GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q5_0_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q5_1_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
         GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
         GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
         GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
@@ -335,6 +349,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(mul);
     GGML_METAL_DEL_KERNEL(mul_row);
     GGML_METAL_DEL_KERNEL(scale);
+    GGML_METAL_DEL_KERNEL(scale_4);
     GGML_METAL_DEL_KERNEL(silu);
     GGML_METAL_DEL_KERNEL(relu);
     GGML_METAL_DEL_KERNEL(gelu);
@@ -346,6 +361,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(get_rows_f16);
     GGML_METAL_DEL_KERNEL(get_rows_q4_0);
     GGML_METAL_DEL_KERNEL(get_rows_q4_1);
+    GGML_METAL_DEL_KERNEL(get_rows_q5_0);
+    GGML_METAL_DEL_KERNEL(get_rows_q5_1);
     GGML_METAL_DEL_KERNEL(get_rows_q8_0);
     GGML_METAL_DEL_KERNEL(get_rows_q2_K);
     GGML_METAL_DEL_KERNEL(get_rows_q3_K);
@@ -360,6 +377,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4);
     GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32);
     GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_q5_0_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_q5_1_f32);
     GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32);
     GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32);
     GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32);
@@ -370,8 +389,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
     GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
     GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
     GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q5_0_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q5_1_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
     GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
     GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
     GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
@@ -905,15 +926,20 @@ void ggml_metal_graph_compute(
 
                 const float scale = *(const float *) src1->data;
 
-                [encoder setComputePipelineState:ctx->pipeline_scale];
+                int64_t n = ggml_nelements(dst);
+
+                if (n % 4 == 0) {
+                    n /= 4;
+                    [encoder setComputePipelineState:ctx->pipeline_scale_4];
+                } else {
+                    [encoder setComputePipelineState:ctx->pipeline_scale];
+                }
+
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                 [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
 
-                const int64_t n = ggml_nelements(dst);
-                GGML_ASSERT(n % 4 == 0);
-
-                [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
             } break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(gf->nodes[i])) {
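
The scale op now has a vectorized variant: when the element count is a multiple of 4 the dispatch shrinks to n/4 threadgroups against the new scale_4 pipeline (one float4 per invocation), otherwise the scalar pipeline runs over all n elements, which is what lets the old GGML_ASSERT(n % 4 == 0) go away. The same selection logic in isolation (plain C; the pipeline names stand in for the Metal objects):

    #include <stdint.h>
    #include <stdio.h>

    /* sketch of the kernel/grid choice made above */
    static void plan_scale_dispatch(int64_t nelements) {
        int64_t n = nelements;
        const char * pipeline = "pipeline_scale";
        if (n % 4 == 0) {
            n /= 4;                        /* each invocation covers a float4 */
            pipeline = "pipeline_scale_4";
        }
        printf("%6lld elements -> %s, %lld threadgroups\n",
               (long long) nelements, pipeline, (long long) n);
    }

    int main(void) {
        plan_scale_dispatch(4096);   /* divisible by 4 -> scale_4, 1024 threadgroups */
        plan_scale_dispatch(4099);   /* not divisible  -> scalar,  4099 threadgroups */
        return 0;
    }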
@@ -1052,6 +1078,8 @@ void ggml_metal_graph_compute(
                     case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
                     case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
                     case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
+                    case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_0_f32]; break;
+                    case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_1_f32]; break;
                     case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q8_0_f32]; break;
                     case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
                     case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
@@ -1121,6 +1149,24 @@ void ggml_metal_graph_compute(
                             nth1 = 8;
                             [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32];
                         } break;
+                    case GGML_TYPE_Q5_0:
+                        {
+                            GGML_ASSERT(ne02 == 1);
+                            GGML_ASSERT(ne12 == 1);
+
+                            nth0 = 8;
+                            nth1 = 8;
+                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_0_f32];
+                        } break;
+                    case GGML_TYPE_Q5_1:
+                        {
+                            GGML_ASSERT(ne02 == 1);
+                            GGML_ASSERT(ne12 == 1);
+
+                            nth0 = 8;
+                            nth1 = 8;
+                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_1_f32];
+                        } break;
                     case GGML_TYPE_Q8_0:
                         {
                             GGML_ASSERT(ne02 == 1);
@@ -1201,7 +1247,8 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
                 [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
 
-                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
+                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
+                    src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
                     src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
                     [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                 }
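
Q5_0 and Q5_1 join the list of quantizations dispatched through the (ne01 + 7)/8 grid: adding 7 before the integer division rounds the row count up, so a trailing partial group of rows still gets its own threadgroup. A small worked example of that rounding (reading "8 rows per threadgroup" off the divisor in the dispatch above, not from the shaders themselves):

    #include <stdio.h>

    /* ceil(ne01 / 8) via the usual add-then-divide integer trick */
    static int n_threadgroups(int ne01) {
        return (ne01 + 7) / 8;
    }

    int main(void) {
        const int rows[] = {1, 8, 9, 4096, 4100};
        for (int i = 0; i < 5; ++i) {
            printf("ne01 = %4d -> %3d threadgroups\n", rows[i], n_threadgroups(rows[i]));
        }
        return 0;
    }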
@@ -1233,6 +1280,8 @@ void ggml_metal_graph_compute(
                     case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
                     case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
                     case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
+                    case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_0]; break;
+                    case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_1]; break;
                     case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q8_0]; break;
                     case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
                     case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;