llama_cpp 0.10.3 → 0.11.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/LICENSE.txt +1 -1
- data/ext/llama_cpp/extconf.rb +35 -110
- data/ext/llama_cpp/llama_cpp.cpp +52 -28
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -1
- data/vendor/include/.gitkeep +0 -0
- data/vendor/lib/.gitkeep +0 -0
- data/vendor/tmp/llama.cpp/Makefile +758 -0
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend.c +6 -2
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-cuda.cu +73 -63
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-impl.h +1 -0
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.m +43 -20
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.metal +464 -245
- data/vendor/tmp/llama.cpp/ggml-opencl.h +25 -0
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-quants.c +61 -57
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml.c +171 -5
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml.h +1 -0
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/llama.cpp +222 -105
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/llama.h +31 -32
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +38 -0
- metadata +30 -27
- data/ext/llama_cpp/src/ggml-opencl.h +0 -25
- data/ext/llama_cpp/src/llama-util.h +0 -546
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/LICENSE +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-alloc.c +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-alloc.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend-impl.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-cuda.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-mpi.c +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-mpi.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-opencl.cpp +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-quants.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/unicode.h +0 -0
data/vendor/tmp/llama.cpp/ggml-backend.c

```diff
@@ -614,10 +614,14 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
 }
 
 static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-
+    switch (op->op) {
+        case GGML_OP_MUL_MAT:
+            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+        default:
+            return true;
+    }
 
     GGML_UNUSED(backend);
-    GGML_UNUSED(op);
 }
 
 static struct ggml_backend_i cpu_backend_i = {
```
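The hunk above replaces the CPU backend's unconditional "supported" answer with a per-op check: GGML_OP_MUL_MAT is only reported as supported when src1 is F32 or already has the vec_dot_type that src0's type traits expect. The following is a minimal standalone sketch of that predicate; the Tensor struct, the enums, and the vec_dot_type mapping are mock stand-ins invented for the example, not ggml's real definitions.

```cpp
#include <cstdio>

enum Type { TYPE_F32, TYPE_F16, TYPE_Q4_0 };
enum Op   { OP_MUL_MAT, OP_ADD };

struct Tensor {
    Op   op;
    Type type;
    const Tensor * src[2];
};

// Stand-in for ggml_internal_get_type_traits(type).vec_dot_type: the type that
// a given source type's dot-product kernel expects its second operand to use.
static Type vec_dot_type(Type t) {
    return t == TYPE_F32 ? TYPE_F32 : TYPE_F16; // illustrative mapping only
}

// Mirrors the shape of the new ggml_backend_cpu_supports_op check:
// MUL_MAT is only claimed when src1 is F32 or already in vec_dot_type form.
static bool cpu_supports_op(const Tensor * op) {
    switch (op->op) {
        case OP_MUL_MAT:
            return op->src[1]->type == TYPE_F32 ||
                   op->src[1]->type == vec_dot_type(op->src[0]->type);
        default:
            return true;
    }
}

int main() {
    Tensor a{OP_ADD, TYPE_Q4_0, {nullptr, nullptr}};
    Tensor b{OP_ADD, TYPE_F16,  {nullptr, nullptr}};
    Tensor mm{OP_MUL_MAT, TYPE_F32, {&a, &b}};
    std::printf("mul_mat supported: %s\n", cpu_supports_op(&mm) ? "yes" : "no");
    return 0;
}
```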
data/vendor/tmp/llama.cpp/ggml-cuda.cu

```diff
@@ -119,7 +119,9 @@
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA 700
 #define CC_OFFSET_AMD 1000000
+#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
+#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
 
 #define GGML_CUDA_MAX_NODES 8192
 
@@ -133,7 +135,6 @@
 
 // TODO: improve this to be correct for more hardware
 // for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-// probably other such cases, and not sure what happens on AMD hardware
 #if !defined(GGML_CUDA_FORCE_MMQ)
 #define CUDA_USE_TENSOR_CORES
 #endif
```
```diff
@@ -6662,7 +6663,7 @@ static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) {
 // pool with virtual memory
 static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
 static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
-static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull <<
+static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
 
 static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
@@ -7485,6 +7486,8 @@ static void ggml_cuda_op_dequantize_mul_mat_vec(
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
 
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
     // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
 #ifdef GGML_CUDA_F16
     cuda_pool_alloc<half> src1_dfloat_a;
@@ -7577,6 +7580,7 @@ static void ggml_cuda_op_mul_mat_cublas(
     const int compute_capability = g_device_caps[id].cc;
 
     if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
+        //printf("this branch\n");
        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
        cuda_pool_alloc<half> src0_as_f16;
        if (src0->type != GGML_TYPE_F16) {
@@ -7614,9 +7618,9 @@ static void ggml_cuda_op_mul_mat_cublas(
 
        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
        to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
-    }
-    else {
+    } else {
        cuda_pool_alloc<float> src0_ddq_as_f32;
+       cuda_pool_alloc<float> src1_ddq_as_f32;
 
        if (src0->type != GGML_TYPE_F32) {
            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
@@ -7624,7 +7628,15 @@ static void ggml_cuda_op_mul_mat_cublas(
            src0_ddq_as_f32.alloc(row_diff*ne00);
            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
        }
+       if (src1->type != GGML_TYPE_F32) {
+           const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
+           GGML_ASSERT(to_fp32_cuda != nullptr);
+           src1_ddq_as_f32.alloc(src1_ncols*ne10);
+           to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
+       }
+
        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
+       const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
 
        const float alpha = 1.0f;
        const float beta = 0.0f;
@@ -7633,9 +7645,9 @@ static void ggml_cuda_op_mul_mat_cublas(
        CUBLAS_CHECK(
            cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                    row_diff, src1_ncols, ne10,
-                   &alpha, src0_ddf_i,
-
-                   &beta, dst_dd_i,
+                   &alpha, src0_ddf_i, ne00,
+                           src1_ddf1_i, ne10,
+                   &beta, dst_dd_i, ldc));
    }
 
    (void) dst;
@@ -8035,6 +8047,7 @@ static void ggml_cuda_op_mul_mat(
 
     GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
 
     GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
 
```
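The hunk at 7633 completes the previously truncated cublasSgemm arguments: with CUBLAS_OP_T/CUBLAS_OP_N the leading dimensions are ne00 for src0, ne10 for src1, and ldc for the output. As a rough reference for how those parameters are used, here is a plain CPU version of the same column-major A^T·B product; the sizes in main() are made-up example values, and the function only restates the standard BLAS convention, not code from llama.cpp or this gem.

```cpp
#include <cstdio>
#include <vector>

// C (m x n) = alpha * A^T * B + beta * C in column-major storage,
// where A is stored k x m with leading dimension lda and B is k x n with ldb.
// In the diff the roles are m = row_diff, n = src1_ncols, k = ne10,
// lda = ne00, ldb = ne10, ldc = ldc.
static void sgemm_t_n(int m, int n, int k, float alpha,
                      const float * A, int lda,
                      const float * B, int ldb,
                      float beta, float * C, int ldc) {
    for (int j = 0; j < n; ++j) {
        for (int i = 0; i < m; ++i) {
            float acc = 0.0f;
            for (int p = 0; p < k; ++p) {
                acc += A[p + i*lda] * B[p + j*ldb]; // A^T(i,p) * B(p,j)
            }
            C[i + j*ldc] = alpha*acc + beta*C[i + j*ldc];
        }
    }
}

int main() {
    const int m = 2, n = 3, k = 4;      // example sizes only
    std::vector<float> A(k*m, 1.0f);    // lda = k
    std::vector<float> B(k*n, 2.0f);    // ldb = k
    std::vector<float> C(m*n, 0.0f);    // ldc = m
    sgemm_t_n(m, n, k, 1.0f, A.data(), k, B.data(), k, 0.0f, C.data(), m);
    std::printf("C[0,0] = %g\n", C[0]); // 4 * (1*2) = 8
    return 0;
}
```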
```diff
@@ -8481,9 +8494,9 @@ static __global__ void k_compute_batched_ptrs(
     int64_t i03 = i13 / r3;
     int64_t i02 = i12 / r2;
 
-    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02
-    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
 }
 
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -8492,28 +8505,10 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
 
-
-    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
-    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
-    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+    GGML_TENSOR_BINARY_OP_LOCALS
 
-    const int64_t
-    const int64_t ne = ggml_nelements(dst);
+    const int64_t ne_dst = ggml_nelements(dst);
 
     ggml_cuda_set_device(g_main_device);
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
@@ -8522,7 +8517,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
-    half *
+    half * src0_f16 = (half *) src0_ddq;
 
     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
@@ -8531,11 +8526,15 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     // convert src1 to fp16
-
-
-
-
-
+    cuda_pool_alloc<half> src1_f16_alloc;
+    if (src1->type != GGML_TYPE_F16) {
+        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+        const int64_t ne_src1 = ggml_nelements(src1);
+        src1_f16_alloc.alloc(ne_src1);
+        GGML_ASSERT(to_fp16_cuda != nullptr);
+        to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
+    }
+    half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
 
     cuda_pool_alloc<half> dst_f16;
     char * dst_t;
@@ -8557,7 +8556,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     const void * beta = &beta_f16;
 
     if (dst->op_params[0] == GGML_PREC_DEFAULT) {
-        dst_t = (char *) dst_f16.alloc(
+        dst_t = (char *) dst_f16.alloc(ne_dst);
 
         nbd2 /= sizeof(float) / sizeof(half);
        nbd3 /= sizeof(float) / sizeof(half);
@@ -8604,9 +8603,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
        CUBLAS_CHECK(
        cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
-               alpha, (const char *)
-                      (const char *)
-               beta,  (      char *)
+               alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
+                      (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
+               beta,  (      char *) dst_t, cu_data_type, ne01, nb2/nb0,         // strideC
                ne12*ne13,
                cu_compute_type,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
@@ -8619,12 +8618,13 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
        dim3 block_dims(ne13, ne12);
        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
-
+               src0_f16, src1_f16, dst_t,
               ptrs_src.get(), ptrs_dst.get(),
               ne12, ne13,
               ne23,
               nb02, nb03,
-              nb12
+              src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
+              src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
               nbd2, nbd3,
               r2, r3);
        CUDA_CHECK(cudaGetLastError());
@@ -8632,8 +8632,8 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
        CUBLAS_CHECK(
        cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
-               alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/
-                      (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/
+               alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
+                      (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
                ne23,
                cu_compute_type,
@@ -8643,7 +8643,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
     if (dst->op_params[0] == GGML_PREC_DEFAULT) {
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16.get(), dst_ddf,
+        to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
     }
 }
 
```
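The k_compute_batched_ptrs fix above appends the previously missing i03*nb03 / i13*nb13 terms, so the per-batch pointer tables also step through the fourth tensor dimension when src0 is broadcast against src1 (i02 = i12/r2, i03 = i13/r3). A small CPU sketch of that pointer-table construction, with dummy strides and sizes chosen only for illustration, might look like this:

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int64_t ne12 = 4, ne13 = 2;      // src1/dst batch dims
    const int64_t r2 = 2, r3 = 1;          // broadcast ratios ne12/ne02, ne13/ne03
    const int64_t nb02 = 128, nb03 = 512;  // src0 byte strides for dims 2 and 3 (dummy values)
    const int64_t ne23 = ne12*ne13;        // total number of (i12, i13) batches

    std::vector<char> src0(4096, 0);       // stand-in for the fp16 src0 buffer
    std::vector<const char *> ptrs_src0(ne23);

    for (int64_t i13 = 0; i13 < ne13; ++i13) {
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
            const int64_t i03 = i13 / r3;
            const int64_t i02 = i12 / r2;
            // same addressing as the fixed lines: base + i02*nb02 + i03*nb03
            ptrs_src0[i12 + i13*ne12] = src0.data() + i02*nb02 + i03*nb03;
        }
    }

    for (int64_t i = 0; i < ne23; ++i) {
        std::printf("batch %lld -> byte offset %lld\n",
                    (long long) i, (long long) (ptrs_src0[i] - src0.data()));
    }
    return 0;
}
```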
```diff
@@ -8662,11 +8662,25 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }
 
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
+    bool use_mul_mat_q = ggml_is_quantized(src0->type);
 #ifdef CUDA_USE_TENSOR_CORES
-
+    use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
+#endif // CUDA_USE_TENSOR_CORES
+
 #else
-
-
+
+    const bool fp16_performance_good = min_compute_capability >= CC_VOLTA;
+    bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+#ifdef CUDA_USE_TENSOR_CORES
+    // when tensor cores are available, use them for large batch size
+    // ref: https://github.com/ggerganov/llama.cpp/pull/3776
+    use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE);
+#endif // CUDA_USE_TENSOR_CORES
+
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 
     // debug helpers
     //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
@@ -8676,19 +8690,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (!split && all_on_device && !
+    if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (!split && all_on_device && !
+    } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (!split && all_on_device &&
+    } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
         // KQ + KQV multi-batch
         ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->type == GGML_TYPE_F32) {
 #ifdef GGML_CUDA_FORCE_DMMV
             const bool use_mul_mat_vec_q = false;
 #else
@@ -8702,14 +8716,6 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
         }
     } else {
-        bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
-
-        // when tensor cores are available, use them for large batch size
-        // ref: https://github.com/ggerganov/llama.cpp/pull/3776
-        if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
-            use_mul_mat_q = false;
-        }
-
         if (use_mul_mat_q) {
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
         } else {
```
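The dispatch hunks above hoist the fp16_performance_good / use_mul_mat_q decisions to the top of ggml_cuda_mul_mat, with separate thresholds for the HIP/AMD build (CC_RDNA1, CC_RDNA3) and the CUDA build (CC_VOLTA, MIN_CC_DP4A). The sketch below restates that selection as ordinary runtime code, treating CUDA_USE_TENSOR_CORES as defined and using an assumed MMQ_MAX_BATCH_SIZE value; it is only an illustration of the logic in the diff, not the actual function.

```cpp
#include <cstdio>

constexpr int MIN_CC_DP4A   = 610;
constexpr int CC_VOLTA      = 700;
constexpr int CC_OFFSET_AMD = 1000000;
constexpr int CC_RDNA1      = CC_OFFSET_AMD + 1010;
constexpr int CC_RDNA3      = CC_OFFSET_AMD + 1100;
constexpr int MMQ_MAX_BATCH_SIZE = 32; // assumed value for this sketch

struct MulMatChoice {
    bool fp16_performance_good;
    bool use_mul_mat_q;
};

// is_amd mimics the GGML_USE_HIPBLAS/__HIP_PLATFORM_AMD__ compile-time split as
// a runtime flag; src0_quantized and batch mimic the relevant tensor properties.
MulMatChoice choose(int cc, bool is_amd, bool src0_quantized, int batch) {
    MulMatChoice c{};
    if (is_amd) {
        c.fp16_performance_good = cc >= CC_RDNA1;
        c.use_mul_mat_q = src0_quantized && cc < CC_RDNA3;
    } else {
        c.fp16_performance_good = cc >= CC_VOLTA;
        c.use_mul_mat_q = cc >= MIN_CC_DP4A && src0_quantized;
        // with tensor cores available, prefer cuBLAS for large batches (PR #3776)
        c.use_mul_mat_q = c.use_mul_mat_q &&
                          !(c.fp16_performance_good && batch > MMQ_MAX_BATCH_SIZE);
    }
    return c;
}

int main() {
    const MulMatChoice c = choose(860, /*is_amd=*/false, /*src0_quantized=*/true, 512);
    std::printf("fp16 good: %d, use mmq: %d\n", c.fp16_performance_good, c.use_mul_mat_q);
    return 0;
}
```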
```diff
@@ -10033,14 +10039,19 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
                 }
                 return false;
             } break;
+        case GGML_OP_DUP:
+        case GGML_OP_REPEAT:
+        case GGML_OP_CONCAT:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+            } break;
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
         case GGML_OP_NORM:
-        case GGML_OP_REPEAT:
-        case GGML_OP_DUP:
         case GGML_OP_ADD:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
@@ -10057,7 +10068,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
-        case GGML_OP_CONCAT:
         case GGML_OP_GROUP_NORM:
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
```
data/vendor/tmp/llama.cpp/ggml-metal.m

```diff
@@ -87,6 +87,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(get_rows_q4_K);
     GGML_METAL_DECL_KERNEL(get_rows_q5_K);
     GGML_METAL_DECL_KERNEL(get_rows_q6_K);
+    GGML_METAL_DECL_KERNEL(get_rows_i32);
     GGML_METAL_DECL_KERNEL(rms_norm);
     GGML_METAL_DECL_KERNEL(group_norm);
     GGML_METAL_DECL_KERNEL(norm);
```
```diff
@@ -257,13 +258,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
 #endif
         NSError * error = nil;
-        NSString * libPath = [bundle pathForResource:@"
+        NSString * libPath = [bundle pathForResource:@"ggml" ofType:@"metallib"];
         if (libPath != nil) {
+            // pre-compiled library found
             NSURL * libURL = [NSURL fileURLWithPath:libPath];
             GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
             ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
         } else {
-            GGML_METAL_LOG_INFO("%s:
+            GGML_METAL_LOG_INFO("%s: ggml.metallib not found, loading from source\n", __func__);
 
             NSString * sourcePath;
             NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
@@ -291,6 +293,13 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
             options = [MTLCompileOptions new];
             options.preprocessorMacros = @{ @"QK_K" : @(64) };
 #endif
+            // try to disable fast-math
+            // NOTE: this seems to have no effect whatsoever
+            // instead, in order to disable fast-math, we have to build ggml.metallib from the command line
+            // using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
+            // and go through the "pre-compiled library found" path above
+            //[options setFastMathEnabled:false];
+
             ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
         }
 
```
```diff
@@ -369,6 +378,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(get_rows_q4_K);
         GGML_METAL_ADD_KERNEL(get_rows_q5_K);
         GGML_METAL_ADD_KERNEL(get_rows_q6_K);
+        GGML_METAL_ADD_KERNEL(get_rows_i32);
         GGML_METAL_ADD_KERNEL(rms_norm);
         GGML_METAL_ADD_KERNEL(group_norm);
         GGML_METAL_ADD_KERNEL(norm);
@@ -491,6 +501,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(get_rows_q4_K);
     GGML_METAL_DEL_KERNEL(get_rows_q5_K);
     GGML_METAL_DEL_KERNEL(get_rows_q6_K);
+    GGML_METAL_DEL_KERNEL(get_rows_i32);
     GGML_METAL_DEL_KERNEL(rms_norm);
     GGML_METAL_DEL_KERNEL(group_norm);
     GGML_METAL_DEL_KERNEL(norm);
```
```diff
@@ -1230,7 +1241,7 @@ void ggml_metal_graph_compute(
             // not sure how to avoid this
             // TODO: make a simpler cpy_bytes kernel
 
-            const int nth = MIN(
+            const int nth = MIN((int) ctx->pipeline_cpy_f32_f32.maxTotalThreadsPerThreadgroup, ne00);
 
             [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32];
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -1285,7 +1296,7 @@ void ggml_metal_graph_compute(
             [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
             [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
 
-            const int nth = MIN(
+            const int nth = MIN((int) ctx->pipeline_add.maxTotalThreadsPerThreadgroup, ne00);
 
             [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
         } break;
```
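Both hunks above (and the later upscale hunk) derive the threadgroup size from the pipeline itself rather than a fixed cap: nth = MIN(pipeline.maxTotalThreadsPerThreadgroup, ne00), so the dispatch never exceeds what the compiled pipeline allows nor the row length being processed. A trivial standalone sketch of that clamp, with made-up numbers in place of the Metal pipeline query:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    const int max_threads_per_threadgroup = 1024; // stand-in for pipeline.maxTotalThreadsPerThreadgroup
    const int ne00 = 320;                         // row length of the tensor being copied/added

    const int nth = std::min(max_threads_per_threadgroup, ne00);
    std::printf("threads per threadgroup: %d\n", nth);
    return 0;
}
```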
```diff
@@ -1649,6 +1660,10 @@ void ggml_metal_graph_compute(
                 }
             };
 
+            if (ggml_is_quantized(src0t)) {
+                GGML_ASSERT(ne00 >= nth0*nth1);
+            }
+
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
             [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
             [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
@@ -1707,6 +1722,9 @@ void ggml_metal_graph_compute(
             // TODO: make this more general
             GGML_ASSERT(n_as <= 8);
 
+            // max size of the src1ids array in the kernel stack
+            GGML_ASSERT(ne11 <= 512);
+
             struct ggml_tensor * src2 = gf->nodes[i]->src[2];
 
             const int64_t ne20 = src2 ? src2->ne[0] : 0;
```
```diff
@@ -1724,9 +1742,6 @@ void ggml_metal_graph_compute(
             GGML_ASSERT(!ggml_is_transposed(src2));
             GGML_ASSERT(!ggml_is_transposed(src1));
 
-            GGML_ASSERT(ne20 % 32 == 0);
-            // !!!!!!!!! TODO: this assert is probably required but not sure!
-            //GGML_ASSERT(ne20 >= 64);
             GGML_ASSERT(src1t == GGML_TYPE_F32);
 
             const uint r2 = ne12/ne22;
@@ -1734,22 +1749,22 @@ void ggml_metal_graph_compute(
 
             // find the break-even point where the matrix-matrix kernel becomes more efficient compared
             // to the matrix-vector kernel
-            int ne11_mm_min =
+            int ne11_mm_min = n_as;
 
             const int idx = ((int32_t *) dst->op_params)[0];
 
             // batch size
             GGML_ASSERT(ne01 == ne11);
 
-            const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory
-
             // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
             // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
             // !!!
             // TODO: for now, always use mat-vec kernels until we figure out how to improve the
             // indirect matrix multiplication
             // !!!
-            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+                ne20 % 32 == 0 && ne20 >= 64 &&
+                ne11 > ne11_mm_min) {
                 switch (src2->type) {
                     case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32]; break;
                     case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32]; break;
```
```diff
@@ -1779,14 +1794,15 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
                 [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
                 [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
-                [encoder setBytes:&
+                [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
                 [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
                 [encoder setBytes:&r2 length:sizeof(r2) atIndex:16];
                 [encoder setBytes:&r3 length:sizeof(r3) atIndex:17];
                 [encoder setBytes:&idx length:sizeof(idx) atIndex:18];
                 // TODO: how to make this an array? read Metal docs
-                for (int j = 0; j <
-
+                for (int j = 0; j < 8; ++j) {
+                    // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
+                    struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
 
                     size_t offs_src_cur = 0;
                     id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
@@ -1796,8 +1812,7 @@ void ggml_metal_graph_compute(
 
                 [encoder setThreadgroupMemoryLength:8192 atIndex:0];
 
-
-                [encoder dispatchThreadgroups:MTLSizeMake( (_ne1 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
             } else {
                 int nth0 = 32;
                 int nth1 = 1;
```
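The rewritten loop above always binds all 8 expert-buffer slots and, when fewer than 8 experts exist, cycles over the ones that do (dst->src[2 + (j % n_as)]) so no kernel argument is left uninitialized. A standalone sketch of that binding pattern, using plain strings in place of Metal buffers and a dummy expert list:

```cpp
#include <cstdio>
#include <string>
#include <vector>

int main() {
    const int n_as = 3;                                   // number of experts actually present (< 8)
    const std::vector<std::string> experts = {"e0", "e1", "e2"};

    std::string bound[8];                                 // the 8 argument slots the kernel expects
    for (int j = 0; j < 8; ++j) {
        // same indexing as dst->src[2 + (j % n_as)] in the diff: repeat existing
        // experts so every slot carries a valid binding
        bound[j] = experts[j % n_as];
    }

    for (int j = 0; j < 8; ++j) {
        std::printf("slot %d -> %s\n", j, bound[j].c_str());
    }
    return 0;
}
```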
```diff
@@ -1880,11 +1895,17 @@ void ggml_metal_graph_compute(
                     } break;
                 default:
                     {
-                        GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)
+                        GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
                         GGML_ASSERT(false && "not implemented");
                     }
             };
 
+            if (ggml_is_quantized(src2t)) {
+                GGML_ASSERT(ne20 >= nth0*nth1);
+            }
+
+            const int64_t _ne1 = 1; // kernels needs a reference in constant memory
+
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
             [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
             [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
@@ -1909,8 +1930,9 @@ void ggml_metal_graph_compute(
             [encoder setBytes:&r3 length:sizeof(r3) atIndex:21];
             [encoder setBytes:&idx length:sizeof(idx) atIndex:22];
             // TODO: how to make this an array? read Metal docs
-            for (int j = 0; j <
-
+            for (int j = 0; j < 8; ++j) {
+                // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
+                struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
 
                 size_t offs_src_cur = 0;
                 id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
```
|
|
1959
1981
|
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
|
1960
1982
|
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
|
1961
1983
|
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
|
1984
|
+
case GGML_TYPE_I32: [encoder setComputePipelineState:ctx->pipeline_get_rows_i32]; break;
|
1962
1985
|
default: GGML_ASSERT(false && "not implemented");
|
1963
1986
|
}
|
1964
1987
|
|
@@ -2229,7 +2252,7 @@ void ggml_metal_graph_compute(
|
|
2229
2252
|
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
2230
2253
|
[encoder setBytes:&sf length:sizeof(sf) atIndex:18];
|
2231
2254
|
|
2232
|
-
const int nth = MIN(
|
2255
|
+
const int nth = MIN((int) ctx->pipeline_upscale_f32.maxTotalThreadsPerThreadgroup, ne0);
|
2233
2256
|
|
2234
2257
|
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
2235
2258
|
} break;
|