llama_cpp 0.10.3 → 0.10.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE.txt +1 -1
- data/ext/llama_cpp/src/ggml-backend.c +6 -2
- data/ext/llama_cpp/src/ggml-cuda.cu +73 -63
- data/ext/llama_cpp/src/ggml-impl.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +43 -20
- data/ext/llama_cpp/src/ggml-metal.metal +464 -245
- data/ext/llama_cpp/src/ggml-opencl.h +9 -9
- data/ext/llama_cpp/src/ggml-quants.c +61 -57
- data/ext/llama_cpp/src/ggml.c +171 -5
- data/ext/llama_cpp/src/ggml.h +1 -0
- data/ext/llama_cpp/src/llama.cpp +222 -105
- data/ext/llama_cpp/src/llama.h +31 -32
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4db71bfe6290f23102180e5fb7544e4c752ac895f6fefdbb0c1b0b1e52660ebc
+  data.tar.gz: 72b1a13ae9c30230eb18eb83feba0c053297a5da22bd99fabee22cf3c1f9ec7d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8b7406aaf8d1612678635f686c5770d7ce592596caa5cd5687a66eefa088945f36ef6d40e440a8c29956fc1623c1d2713db9ee37b0f35a0d7d959564c7eb929b
+  data.tar.gz: 307e1471b63cbad71d65b816794ca497e9b883bd2430cb9cef6425f7972cbdd93d6ef68aa6358525f1f82a814c40041240df6de3183a669d7d01607ae20d98e7
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
+## [[0.10.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.3...v0.10.4)] - 2024-01-06
+
+- Bump bundled llama.cpp from b1710 to b1768.
+
 ## [[0.10.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.2...v0.10.3)] - 2023-12-29
 
 - Bump bundled llama.cpp from b1686 to b1710.
data/LICENSE.txt
CHANGED
data/ext/llama_cpp/src/ggml-backend.c
CHANGED
@@ -614,10 +614,14 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
 }
 
 static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-
+    switch (op->op) {
+        case GGML_OP_MUL_MAT:
+            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+        default:
+            return true;
+    }
 
     GGML_UNUSED(backend);
-    GGML_UNUSED(op);
 }
 
 static struct ggml_backend_i cpu_backend_i = {
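With this change the CPU backend reports MUL_MAT as supported only when src1 is plain F32 or already converted to the weight type's vec_dot_type, instead of unconditionally returning true. The value of an accurate supports_op callback is that a scheduler can fall back to another backend for the ops a backend declines. A hedged, generic C++ sketch of that pattern follows; the struct, names and op strings are stand-ins, not the ggml API:

```cpp
#include <cstdio>
#include <functional>
#include <string>
#include <vector>

// Generic stand-in for a backend: a name plus a "can this backend run this op?"
// callback, mirroring the role played by ggml_backend_cpu_supports_op above.
struct backend {
    std::string name;
    std::function<bool(const std::string & op)> supports_op;
};

// Pick the first backend that claims support for the op. The real scheduler in
// ggml-backend.c is far more involved; this only illustrates why supports_op
// needs to be accurate rather than always returning true.
static const backend * pick_backend(const std::vector<backend> & backends, const std::string & op) {
    for (const backend & b : backends) {
        if (b.supports_op(op)) {
            return &b;
        }
    }
    return nullptr;
}

int main() {
    std::vector<backend> backends = {
        {"gpu", [](const std::string & op) { return op != "MUL_MAT_ID"; }}, // declines one op
        {"cpu", [](const std::string &)    { return true; }},               // catch-all fallback
    };
    std::printf("%s\n", pick_backend(backends, "MUL_MAT_ID")->name.c_str()); // cpu
    std::printf("%s\n", pick_backend(backends, "MUL_MAT")->name.c_str());    // gpu
    return 0;
}
```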
data/ext/llama_cpp/src/ggml-cuda.cu
CHANGED
@@ -119,7 +119,9 @@
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA 700
 #define CC_OFFSET_AMD 1000000
+#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
+#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
 
 #define GGML_CUDA_MAX_NODES 8192
 
@@ -133,7 +135,6 @@
 
 // TODO: improve this to be correct for more hardware
 // for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-// probably other such cases, and not sure what happens on AMD hardware
 #if !defined(GGML_CUDA_FORCE_MMQ)
 #define CUDA_USE_TENSOR_CORES
 #endif
@@ -6662,7 +6663,7 @@ static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) {
 // pool with virtual memory
 static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
 static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
-static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull <<
+static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
 
 static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
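A quick sanity check on the new pool cap: 1ull << 35 bytes is 32 GiB, which the in-code comment rounds to "32 GB". A minimal standalone C++ confirmation of the arithmetic (not part of the gem):

```cpp
#include <cstdio>

int main() {
    // 2^35 bytes expressed in GiB: 2^35 / 2^30 = 2^5 = 32
    const unsigned long long max_size = 1ull << 35;
    std::printf("%llu bytes = %llu GiB\n", max_size, max_size >> 30); // 34359738368 bytes = 32 GiB
    return 0;
}
```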
@@ -7485,6 +7486,8 @@ static void ggml_cuda_op_dequantize_mul_mat_vec(
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
 
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
     // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
 #ifdef GGML_CUDA_F16
     cuda_pool_alloc<half> src1_dfloat_a;
@@ -7577,6 +7580,7 @@ static void ggml_cuda_op_mul_mat_cublas(
     const int compute_capability = g_device_caps[id].cc;
 
     if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
+        //printf("this branch\n");
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         cuda_pool_alloc<half> src0_as_f16;
         if (src0->type != GGML_TYPE_F16) {
@@ -7614,9 +7618,9 @@ static void ggml_cuda_op_mul_mat_cublas(
 
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
         to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
-    }
-    else {
+    } else {
         cuda_pool_alloc<float> src0_ddq_as_f32;
+        cuda_pool_alloc<float> src1_ddq_as_f32;
 
         if (src0->type != GGML_TYPE_F32) {
             const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
@@ -7624,7 +7628,15 @@ static void ggml_cuda_op_mul_mat_cublas(
             src0_ddq_as_f32.alloc(row_diff*ne00);
             to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
         }
+        if (src1->type != GGML_TYPE_F32) {
+            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
+            GGML_ASSERT(to_fp32_cuda != nullptr);
+            src1_ddq_as_f32.alloc(src1_ncols*ne10);
+            to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
+        }
+
         const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
+        const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
 
         const float alpha = 1.0f;
         const float beta = 0.0f;
@@ -7633,9 +7645,9 @@ static void ggml_cuda_op_mul_mat_cublas(
         CUBLAS_CHECK(
             cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                     row_diff, src1_ncols, ne10,
-                    &alpha, src0_ddf_i,
-
-                    &beta, dst_dd_i,
+                    &alpha, src0_ddf_i, ne00,
+                            src1_ddf1_i, ne10,
+                    &beta, dst_dd_i, ldc));
     }
 
     (void) dst;
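The fix above restores the leading-dimension arguments (ne00, ne10, ldc) to the cublasSgemm call. As a reminder of what those arguments mean, here is a hedged, CPU-only C++ sketch of a column-major GEMM with op(A) = A transposed; the argument order mirrors cublasSgemm, but the code itself is only illustrative:

```cpp
#include <cstdio>

// Reference column-major GEMM: C = alpha * op(A) * B + beta * C with op(A) = A^T.
// A is k x m with leading dimension lda, B is k x n with ldb, C is m x n with ldc.
// The parameter order mirrors cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
// m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc) -- the three leading dimensions
// are exactly the values (ne00, ne10, ldc) restored by the patch above.
static void sgemm_t_n(int m, int n, int k,
                      float alpha, const float * A, int lda,
                      const float * B, int ldb,
                      float beta, float * C, int ldc) {
    for (int j = 0; j < n; ++j) {
        for (int i = 0; i < m; ++i) {
            float acc = 0.0f;
            for (int l = 0; l < k; ++l) {
                acc += A[l + i*lda] * B[l + j*ldb]; // column i of A is row i of op(A)
            }
            C[i + j*ldc] = alpha*acc + beta*C[i + j*ldc];
        }
    }
}

int main() {
    // 2x2 example: A (k=2 x m=2), B (k=2 x n=2), C (m=2 x n=2), densely packed.
    const float A[] = {1, 2, 3, 4}; // columns of A: (1,2), (3,4)
    const float B[] = {5, 6, 7, 8};
    float C[4] = {0};
    sgemm_t_n(2, 2, 2, 1.0f, A, 2, B, 2, 0.0f, C, 2);
    std::printf("%g %g\n%g %g\n", C[0], C[2], C[1], C[3]); // 17 23 / 39 53
    return 0;
}
```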
@@ -8035,6 +8047,7 @@ static void ggml_cuda_op_mul_mat(
 
     GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
 
     GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
 
@@ -8481,9 +8494,9 @@ static __global__ void k_compute_batched_ptrs(
     int64_t i03 = i13 / r3;
     int64_t i02 = i12 / r2;
 
-    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02
-    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
 }
 
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
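k_compute_batched_ptrs builds one pointer per (i12, i13) batch entry, broadcasting src0 across the ratios r2 = ne12/ne02 and r3 = ne13/ne03; the fix adds the previously missing third-dimension byte offsets (i03*nb03 and friends). A hedged host-side C++ sketch of the same indexing, with purely illustrative sizes:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Build per-batch base pointers for a broadcasted batched matmul.
// ne12/ne13 are the batch dims of src1 (and dst); src0 is broadcast with
// ratios r2 = ne12/ne02 and r3 = ne13/ne03. nb02/nb03 are byte strides of src0.
static std::vector<const char *> batched_ptrs(const char * src0, int64_t ne12, int64_t ne13,
                                              int64_t r2, int64_t r3,
                                              int64_t nb02, int64_t nb03) {
    std::vector<const char *> ptrs(ne12*ne13);
    for (int64_t i13 = 0; i13 < ne13; ++i13) {
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
            const int64_t i03 = i13 / r3; // src0 batch index in dim 3
            const int64_t i02 = i12 / r2; // src0 batch index in dim 2
            // both batch dimensions contribute a byte offset; dropping the
            // i03*nb03 term is exactly the bug fixed in the kernel above
            ptrs[i12 + i13*ne12] = src0 + i02*nb02 + i03*nb03;
        }
    }
    return ptrs;
}

int main() {
    // toy layout: dst/src1 have 4 x 2 batches, src0 has 2 x 2 (so r2 = 2, r3 = 1)
    std::vector<char> buf(1 << 14);
    const char * base = buf.data();
    const auto ptrs = batched_ptrs(base, /*ne12=*/4, /*ne13=*/2, /*r2=*/2, /*r3=*/1,
                                   /*nb02=*/1024, /*nb03=*/4096);
    std::printf("batch (3,1) offset = %lld bytes\n",
                (long long)(ptrs[3 + 1*4] - base)); // 1024 + 4096 = 5120
    return 0;
}
```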
@@ -8492,28 +8505,10 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
 
-
-    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
-    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
-    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+    GGML_TENSOR_BINARY_OP_LOCALS
 
-    const int64_t
-    const int64_t ne = ggml_nelements(dst);
+    const int64_t ne_dst = ggml_nelements(dst);
 
     ggml_cuda_set_device(g_main_device);
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
@@ -8522,7 +8517,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
-    half *
+    half * src0_f16 = (half *) src0_ddq;
 
     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
@@ -8531,11 +8526,15 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     // convert src1 to fp16
-
-
-
-
-
+    cuda_pool_alloc<half> src1_f16_alloc;
+    if (src1->type != GGML_TYPE_F16) {
+        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+        const int64_t ne_src1 = ggml_nelements(src1);
+        src1_f16_alloc.alloc(ne_src1);
+        GGML_ASSERT(to_fp16_cuda != nullptr);
+        to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
+    }
+    half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
 
     cuda_pool_alloc<half> dst_f16;
     char * dst_t;
@@ -8557,7 +8556,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     const void * beta = &beta_f16;
 
     if (dst->op_params[0] == GGML_PREC_DEFAULT) {
-        dst_t = (char *) dst_f16.alloc(
+        dst_t = (char *) dst_f16.alloc(ne_dst);
 
         nbd2 /= sizeof(float) / sizeof(half);
         nbd3 /= sizeof(float) / sizeof(half);
@@ -8604,9 +8603,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     CUBLAS_CHECK(
     cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
             ne01, ne11, ne10,
-            alpha, (const char *)
-                   (const char *)
-            beta, ( char *)
+            alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
+                   (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
+            beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC
             ne12*ne13,
             cu_compute_type,
             CUBLAS_GEMM_DEFAULT_TENSOR_OP));
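The strided-batched call above derives cuBLAS leading dimensions and batch strides by dividing ggml byte strides by the element size (nb01/nb00, nb02/nb00, nb2/nb0, ...). A hedged standalone sketch of that bytes-to-elements conversion; the metadata struct here is a stand-in, not the real ggml_tensor:

```cpp
#include <cstdint>
#include <cstdio>

// Minimal stand-in for the ggml tensor metadata used here: ne[] holds the number
// of elements per dimension, nb[] the stride of each dimension in bytes
// (nb[0] is the element size for contiguous tensors).
struct tensor_meta {
    int64_t ne[4];
    int64_t nb[4];
};

// cuBLAS expects leading dimensions and batch strides counted in elements, while
// ggml stores strides in bytes -- hence the nb[i]/nb[0] divisions mirrored by the
// nb01/nb00, nb02/nb00, ... arguments in the patch above.
static void print_gemm_strides(const tensor_meta & t, const char * name) {
    const int64_t ld     = t.nb[1] / t.nb[0]; // leading dimension in elements
    const int64_t stride = t.nb[2] / t.nb[0]; // elements between consecutive batches
    std::printf("%s: ld = %lld, batch stride = %lld\n", name, (long long) ld, (long long) stride);
}

int main() {
    // a contiguous 64 x 32 x 4 fp16 tensor: element size 2 bytes
    tensor_meta src0 = {{64, 32, 4, 1}, {2, 2*64, 2*64*32, 2*64*32*4}};
    print_gemm_strides(src0, "src0"); // ld = 64, batch stride = 2048
    return 0;
}
```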
@@ -8619,12 +8618,13 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
         dim3 block_dims(ne13, ne12);
         k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
-
+                src0_f16, src1_f16, dst_t,
                 ptrs_src.get(), ptrs_dst.get(),
                 ne12, ne13,
                 ne23,
                 nb02, nb03,
-                nb12
+                src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
+                src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
                 nbd2, nbd3,
                 r2, r3);
         CUDA_CHECK(cudaGetLastError());
@@ -8632,8 +8632,8 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUBLAS_CHECK(
         cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/
-                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/
+                alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
+                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
                 beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
                 ne23,
                 cu_compute_type,
@@ -8643,7 +8643,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
     if (dst->op_params[0] == GGML_PREC_DEFAULT) {
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16.get(), dst_ddf,
+        to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
     }
 }
 
@@ -8662,11 +8662,25 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }
 
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
+    bool use_mul_mat_q = ggml_is_quantized(src0->type);
 #ifdef CUDA_USE_TENSOR_CORES
-
+    use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
+#endif // CUDA_USE_TENSOR_CORES
+
 #else
-
-
+
+    const bool fp16_performance_good = min_compute_capability >= CC_VOLTA;
+    bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+#ifdef CUDA_USE_TENSOR_CORES
+    // when tensor cores are available, use them for large batch size
+    // ref: https://github.com/ggerganov/llama.cpp/pull/3776
+    use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE);
+#endif // CUDA_USE_TENSOR_CORES
+
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 
     // debug helpers
     //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
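This hunk splits the choice between the quantized mul_mat_q kernels and the dequantize-plus-BLAS path into a HIP/ROCm branch keyed on RDNA generations and a CUDA branch keyed on DP4A/Volta and tensor cores. A hedged restatement of that decision as a standalone C++ predicate; the compute-capability constants are the ones defined earlier in this diff, while MMQ_MAX_BATCH_SIZE and the function shape are illustrative:

```cpp
#include <cstdint>
#include <cstdio>

// Compute-capability markers as defined earlier in this diff.
constexpr int MIN_CC_DP4A   = 610;
constexpr int CC_VOLTA      = 700;
constexpr int CC_OFFSET_AMD = 1000000;
constexpr int CC_RDNA3      = CC_OFFSET_AMD + 1100;
constexpr int MMQ_MAX_BATCH_SIZE = 32; // illustrative; the real value is defined elsewhere in ggml-cuda.cu

// Should the quantized mul_mat_q kernels be used, or should src0 be
// dequantized and the product handed to cuBLAS/rocBLAS?
static bool should_use_mul_mat_q(bool hip_amd, bool tensor_cores, bool src0_quantized,
                                 int min_cc, int64_t src1_ncols) {
    if (!src0_quantized) {
        return false;
    }
    if (hip_amd) {
        // the HIP/AMD branch in the diff disables MMQ for RDNA3 when the
        // tensor-core path is compiled in
        return !(tensor_cores && min_cc >= CC_RDNA3);
    }
    bool use = min_cc >= MIN_CC_DP4A;
    if (tensor_cores) {
        // when tensor cores are available, large batches go to cuBLAS instead
        const bool fp16_performance_good = min_cc >= CC_VOLTA;
        use = use && !(fp16_performance_good && src1_ncols > MMQ_MAX_BATCH_SIZE);
    }
    return use;
}

int main() {
    std::printf("%d\n", should_use_mul_mat_q(false, true, true, 860, 512)); // 0: large batch on Ampere -> cuBLAS
    std::printf("%d\n", should_use_mul_mat_q(false, true, true, 860, 1));   // 1: single column -> mul_mat_q
    return 0;
}
```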
@@ -8676,19 +8690,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (!split && all_on_device && !
+    if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (!split && all_on_device && !
+    } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (!split && all_on_device &&
+    } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
         // KQ + KQV multi-batch
         ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->type == GGML_TYPE_F32) {
 #ifdef GGML_CUDA_FORCE_DMMV
             const bool use_mul_mat_vec_q = false;
 #else
@@ -8702,14 +8716,6 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
         }
     } else {
-        bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
-
-        // when tensor cores are available, use them for large batch size
-        // ref: https://github.com/ggerganov/llama.cpp/pull/3776
-        if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
-            use_mul_mat_q = false;
-        }
-
         if (use_mul_mat_q) {
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
         } else {
@@ -10033,14 +10039,19 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
                 }
                 return false;
             } break;
+        case GGML_OP_DUP:
+        case GGML_OP_REPEAT:
+        case GGML_OP_CONCAT:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+            } break;
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
         case GGML_OP_NORM:
-        case GGML_OP_REPEAT:
-        case GGML_OP_DUP:
         case GGML_OP_ADD:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
@@ -10057,7 +10068,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
-        case GGML_OP_CONCAT:
        case GGML_OP_GROUP_NORM:
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
data/ext/llama_cpp/src/ggml-metal.m
CHANGED
@@ -87,6 +87,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(get_rows_q4_K);
     GGML_METAL_DECL_KERNEL(get_rows_q5_K);
     GGML_METAL_DECL_KERNEL(get_rows_q6_K);
+    GGML_METAL_DECL_KERNEL(get_rows_i32);
     GGML_METAL_DECL_KERNEL(rms_norm);
     GGML_METAL_DECL_KERNEL(group_norm);
     GGML_METAL_DECL_KERNEL(norm);
@@ -257,13 +258,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
 #endif
         NSError * error = nil;
-        NSString * libPath = [bundle pathForResource:@"
+        NSString * libPath = [bundle pathForResource:@"ggml" ofType:@"metallib"];
         if (libPath != nil) {
+            // pre-compiled library found
             NSURL * libURL = [NSURL fileURLWithPath:libPath];
             GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
             ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
         } else {
-            GGML_METAL_LOG_INFO("%s:
+            GGML_METAL_LOG_INFO("%s: ggml.metallib not found, loading from source\n", __func__);
 
             NSString * sourcePath;
             NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
@@ -291,6 +293,13 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
             options = [MTLCompileOptions new];
             options.preprocessorMacros = @{ @"QK_K" : @(64) };
 #endif
+            // try to disable fast-math
+            // NOTE: this seems to have no effect whatsoever
+            // instead, in order to disable fast-math, we have to build ggml.metallib from the command line
+            // using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
+            // and go through the "pre-compiled library found" path above
+            //[options setFastMathEnabled:false];
+
             ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
         }
 
@@ -369,6 +378,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(get_rows_q4_K);
         GGML_METAL_ADD_KERNEL(get_rows_q5_K);
         GGML_METAL_ADD_KERNEL(get_rows_q6_K);
+        GGML_METAL_ADD_KERNEL(get_rows_i32);
         GGML_METAL_ADD_KERNEL(rms_norm);
         GGML_METAL_ADD_KERNEL(group_norm);
         GGML_METAL_ADD_KERNEL(norm);
@@ -491,6 +501,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(get_rows_q4_K);
     GGML_METAL_DEL_KERNEL(get_rows_q5_K);
     GGML_METAL_DEL_KERNEL(get_rows_q6_K);
+    GGML_METAL_DEL_KERNEL(get_rows_i32);
     GGML_METAL_DEL_KERNEL(rms_norm);
     GGML_METAL_DEL_KERNEL(group_norm);
     GGML_METAL_DEL_KERNEL(norm);
@@ -1230,7 +1241,7 @@ void ggml_metal_graph_compute(
     // not sure how to avoid this
     // TODO: make a simpler cpy_bytes kernel
 
-    const int nth = MIN(
+    const int nth = MIN((int) ctx->pipeline_cpy_f32_f32.maxTotalThreadsPerThreadgroup, ne00);
 
     [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32];
     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -1285,7 +1296,7 @@ void ggml_metal_graph_compute(
     [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
     [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
 
-    const int nth = MIN(
+    const int nth = MIN((int) ctx->pipeline_add.maxTotalThreadsPerThreadgroup, ne00);
 
     [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
     } break;
@@ -1649,6 +1660,10 @@ void ggml_metal_graph_compute(
     }
     };
 
+    if (ggml_is_quantized(src0t)) {
+        GGML_ASSERT(ne00 >= nth0*nth1);
+    }
+
     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
     [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
     [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
@@ -1707,6 +1722,9 @@ void ggml_metal_graph_compute(
     // TODO: make this more general
     GGML_ASSERT(n_as <= 8);
 
+    // max size of the src1ids array in the kernel stack
+    GGML_ASSERT(ne11 <= 512);
+
     struct ggml_tensor * src2 = gf->nodes[i]->src[2];
 
     const int64_t ne20 = src2 ? src2->ne[0] : 0;
@@ -1724,9 +1742,6 @@ void ggml_metal_graph_compute(
     GGML_ASSERT(!ggml_is_transposed(src2));
     GGML_ASSERT(!ggml_is_transposed(src1));
 
-    GGML_ASSERT(ne20 % 32 == 0);
-    // !!!!!!!!! TODO: this assert is probably required but not sure!
-    //GGML_ASSERT(ne20 >= 64);
     GGML_ASSERT(src1t == GGML_TYPE_F32);
 
     const uint r2 = ne12/ne22;
@@ -1734,22 +1749,22 @@ void ggml_metal_graph_compute(
 
     // find the break-even point where the matrix-matrix kernel becomes more efficient compared
     // to the matrix-vector kernel
-    int ne11_mm_min =
+    int ne11_mm_min = n_as;
 
     const int idx = ((int32_t *) dst->op_params)[0];
 
     // batch size
     GGML_ASSERT(ne01 == ne11);
 
-    const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory
-
     // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
     // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
     // !!!
     // TODO: for now, always use mat-vec kernels until we figure out how to improve the
     // indirect matrix multiplication
     // !!!
-    if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+    if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+        ne20 % 32 == 0 && ne20 >= 64 &&
+        ne11 > ne11_mm_min) {
         switch (src2->type) {
             case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32]; break;
             case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32]; break;
@@ -1779,14 +1794,15 @@ void ggml_metal_graph_compute(
     [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
     [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
     [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
-    [encoder setBytes:&
+    [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
     [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
     [encoder setBytes:&r2 length:sizeof(r2) atIndex:16];
     [encoder setBytes:&r3 length:sizeof(r3) atIndex:17];
     [encoder setBytes:&idx length:sizeof(idx) atIndex:18];
     // TODO: how to make this an array? read Metal docs
-    for (int j = 0; j <
-
+    for (int j = 0; j < 8; ++j) {
+        // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
+        struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
 
         size_t offs_src_cur = 0;
         id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
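Binding dst->src[2 + (j % n_as)] for all eight argument slots means every Metal buffer index receives a valid buffer even when fewer than eight experts exist, which avoids uninitialized kernel arguments. A hedged C++ sketch of the same wrap-around idea using a fixed-size slot table (names are illustrative):

```cpp
#include <array>
#include <cstdio>

// Fill a fixed number of argument slots from a smaller pool by wrapping around,
// so no slot is ever left unbound -- the same idea as `dst->src[2 + (j % n_as)]`
// in the Metal encoder loop above.
constexpr int MAX_SLOTS = 8;

static std::array<int, MAX_SLOTS> bind_slots(const int * experts, int n_as) {
    std::array<int, MAX_SLOTS> slots{};
    for (int j = 0; j < MAX_SLOTS; ++j) {
        slots[j] = experts[j % n_as]; // slots beyond n_as just repeat earlier experts
    }
    return slots;
}

int main() {
    const int experts[] = {10, 11, 12}; // only 3 experts available
    const auto slots = bind_slots(experts, 3);
    for (int s : slots) {
        std::printf("%d ", s); // 10 11 12 10 11 12 10 11
    }
    std::printf("\n");
    return 0;
}
```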
@@ -1796,8 +1812,7 @@ void ggml_metal_graph_compute(
 
     [encoder setThreadgroupMemoryLength:8192 atIndex:0];
 
-
-    [encoder dispatchThreadgroups:MTLSizeMake( (_ne1 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+    [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
 } else {
     int nth0 = 32;
     int nth1 = 1;
@@ -1880,11 +1895,17 @@ void ggml_metal_graph_compute(
     } break;
     default:
         {
-            GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)
+            GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
             GGML_ASSERT(false && "not implemented");
         }
     };
 
+    if (ggml_is_quantized(src2t)) {
+        GGML_ASSERT(ne20 >= nth0*nth1);
+    }
+
+    const int64_t _ne1 = 1; // kernels needs a reference in constant memory
+
     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
     [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
     [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
@@ -1909,8 +1930,9 @@ void ggml_metal_graph_compute(
     [encoder setBytes:&r3 length:sizeof(r3) atIndex:21];
     [encoder setBytes:&idx length:sizeof(idx) atIndex:22];
     // TODO: how to make this an array? read Metal docs
-    for (int j = 0; j <
-
+    for (int j = 0; j < 8; ++j) {
+        // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
+        struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
 
         size_t offs_src_cur = 0;
         id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
@@ -1959,6 +1981,7 @@ void ggml_metal_graph_compute(
     case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
     case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
     case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
+    case GGML_TYPE_I32: [encoder setComputePipelineState:ctx->pipeline_get_rows_i32]; break;
     default: GGML_ASSERT(false && "not implemented");
     }
 
@@ -2229,7 +2252,7 @@ void ggml_metal_graph_compute(
     [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
     [encoder setBytes:&sf length:sizeof(sf) atIndex:18];
 
-    const int nth = MIN(
+    const int nth = MIN((int) ctx->pipeline_upscale_f32.maxTotalThreadsPerThreadgroup, ne0);
 
     [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
     } break;