llama_cpp 0.10.3 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +13 -0
  3. data/LICENSE.txt +1 -1
  4. data/ext/llama_cpp/extconf.rb +35 -110
  5. data/ext/llama_cpp/llama_cpp.cpp +52 -28
  6. data/lib/llama_cpp/version.rb +2 -2
  7. data/sig/llama_cpp.rbs +3 -1
  8. data/vendor/include/.gitkeep +0 -0
  9. data/vendor/lib/.gitkeep +0 -0
  10. data/vendor/tmp/llama.cpp/Makefile +758 -0
  11. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend.c +6 -2
  12. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-cuda.cu +73 -63
  13. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-impl.h +1 -0
  14. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.m +43 -20
  15. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.metal +464 -245
  16. data/vendor/tmp/llama.cpp/ggml-opencl.h +25 -0
  17. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-quants.c +61 -57
  18. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml.c +171 -5
  19. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml.h +1 -0
  20. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/llama.cpp +222 -105
  21. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/llama.h +31 -32
  22. data/vendor/tmp/llama.cpp/scripts/get-flags.mk +38 -0
  23. metadata +30 -27
  24. data/ext/llama_cpp/src/ggml-opencl.h +0 -25
  25. data/ext/llama_cpp/src/llama-util.h +0 -546
  26. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/LICENSE +0 -0
  27. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-alloc.c +0 -0
  28. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-alloc.h +0 -0
  29. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend-impl.h +0 -0
  30. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend.h +0 -0
  31. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-cuda.h +0 -0
  32. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.h +0 -0
  33. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-mpi.c +0 -0
  34. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-mpi.h +0 -0
  35. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-opencl.cpp +0 -0
  36. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-quants.h +0 -0
  37. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/unicode.h +0 -0
data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend.c

@@ -614,10 +614,14 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
 }
 
 static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    return true;
+    switch (op->op) {
+        case GGML_OP_MUL_MAT:
+            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+        default:
+            return true;
+    }
 
     GGML_UNUSED(backend);
-    GGML_UNUSED(op);
 }
 
 static struct ggml_backend_i cpu_backend_i = {
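
Note: the new CPU supports_op hook only restricts GGML_OP_MUL_MAT; every other op still reports true. The rule is that src1 must either be F32 or already be quantized to the vec_dot_type that the kernel for src0's type expects (for example Q8_0 when src0 is Q4_0). A minimal sketch of that gating, using toy enums and a stand-in trait table rather than ggml's real type traits:

    /* Toy model of the MUL_MAT check in ggml_backend_cpu_supports_op.
     * The enum and the vec_dot_type table below are stand-ins, not the ggml API. */
    #include <stdbool.h>
    #include <stdio.h>

    typedef enum { T_F32, T_F16, T_Q4_0, T_Q8_0 } toy_type;

    static toy_type vec_dot_type(toy_type src0_type) {
        switch (src0_type) {
            case T_Q4_0: return T_Q8_0; // quantized kernels dot against Q8_0
            case T_F16:  return T_F16;
            default:     return T_F32;
        }
    }

    static bool cpu_supports_mul_mat(toy_type src0_type, toy_type src1_type) {
        // mirrors the diff: F32 src1 is always fine, otherwise src1 must match vec_dot_type(src0)
        return src1_type == T_F32 || src1_type == vec_dot_type(src0_type);
    }

    int main(void) {
        printf("%d\n", cpu_supports_mul_mat(T_Q4_0, T_F32));  // 1: F32 src1 always supported
        printf("%d\n", cpu_supports_mul_mat(T_Q4_0, T_Q8_0)); // 1: matches vec_dot_type
        printf("%d\n", cpu_supports_mul_mat(T_Q4_0, T_F16));  // 0: rejected
        return 0;
    }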
data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-cuda.cu

@@ -119,7 +119,9 @@
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA 700
 #define CC_OFFSET_AMD 1000000
+#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
+#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
 
 #define GGML_CUDA_MAX_NODES 8192
 
@@ -133,7 +135,6 @@
 
 // TODO: improve this to be correct for more hardware
 // for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-// probably other such cases, and not sure what happens on AMD hardware
 #if !defined(GGML_CUDA_FORCE_MMQ)
 #define CUDA_USE_TENSOR_CORES
 #endif
@@ -6662,7 +6663,7 @@ static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) {
 // pool with virtual memory
 static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
 static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
-static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
+static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
 
 static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
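
Note: the constant and its comment stay consistent across the change: 1ull << 36 bytes = 2^36 B = 64 GiB, while the new cap 1ull << 35 = 2^35 B = 32 GiB, so the per-device virtual-memory pool reservation is halved.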
@@ -7485,6 +7486,8 @@ static void ggml_cuda_op_dequantize_mul_mat_vec(
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
 
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
     // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
 #ifdef GGML_CUDA_F16
     cuda_pool_alloc<half> src1_dfloat_a;
@@ -7577,6 +7580,7 @@ static void ggml_cuda_op_mul_mat_cublas(
     const int compute_capability = g_device_caps[id].cc;
 
     if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
+        //printf("this branch\n");
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         cuda_pool_alloc<half> src0_as_f16;
         if (src0->type != GGML_TYPE_F16) {
@@ -7614,9 +7618,9 @@ static void ggml_cuda_op_mul_mat_cublas(
 
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
         to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
-    }
-    else {
+    } else {
         cuda_pool_alloc<float> src0_ddq_as_f32;
+        cuda_pool_alloc<float> src1_ddq_as_f32;
 
         if (src0->type != GGML_TYPE_F32) {
             const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
@@ -7624,7 +7628,15 @@ static void ggml_cuda_op_mul_mat_cublas(
             src0_ddq_as_f32.alloc(row_diff*ne00);
             to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
         }
+        if (src1->type != GGML_TYPE_F32) {
+            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
+            GGML_ASSERT(to_fp32_cuda != nullptr);
+            src1_ddq_as_f32.alloc(src1_ncols*ne10);
+            to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
+        }
+
         const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
+        const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
 
         const float alpha = 1.0f;
         const float beta = 0.0f;
@@ -7633,9 +7645,9 @@ static void ggml_cuda_op_mul_mat_cublas(
         CUBLAS_CHECK(
             cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                     row_diff, src1_ncols, ne10,
-                    &alpha, src0_ddf_i, ne00,
-                            src1_ddf_i, ne10,
-                    &beta,  dst_dd_i, ldc));
+                    &alpha, src0_ddf_i, ne00,
+                            src1_ddf1_i, ne10,
+                    &beta,  dst_dd_i, ldc));
     }
 
     (void) dst;
@@ -8035,6 +8047,7 @@ static void ggml_cuda_op_mul_mat(
 
     GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
 
     GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
 
@@ -8481,9 +8494,9 @@ static __global__ void k_compute_batched_ptrs(
     int64_t i03 = i13 / r3;
     int64_t i02 = i12 / r2;
 
-    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
-    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2 + i13*nbd3;
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2 + i13*nbd3;
 }
 
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -8492,28 +8505,10 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
 
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
-    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
-    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+    GGML_TENSOR_BINARY_OP_LOCALS
 
-    const int64_t ne1 = ggml_nelements(src1);
-    const int64_t ne = ggml_nelements(dst);
+    const int64_t ne_dst = ggml_nelements(dst);
 
     ggml_cuda_set_device(g_main_device);
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
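
Note: GGML_TENSOR_BINARY_OP_LOCALS is the ggml.h convenience macro that declares the dimension and stride locals for src0 (ne00..ne03, nb00..nb03), src1 (ne10..ne13, nb10..nb13) and dst (ne0..ne3, nb0..nb3). That is why the hand-written block of const int64_t declarations can be dropped here, and why the cuBLAS calls further down can express their strides as nb01/nb00, nb11/nb10 and nb2/nb0.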
@@ -8522,7 +8517,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
-    half * src0_as_f16 = (half *) src0_ddq;
+    half * src0_f16 = (half *) src0_ddq;
 
     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
@@ -8531,11 +8526,15 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     // convert src1 to fp16
-    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-    GGML_ASSERT(to_fp16_cuda != nullptr);
-
-    cuda_pool_alloc<half> src1_as_f16(ne1);
-    to_fp16_cuda(src1_ddf, src1_as_f16.get(), ne1, main_stream);
+    cuda_pool_alloc<half> src1_f16_alloc;
+    if (src1->type != GGML_TYPE_F16) {
+        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+        const int64_t ne_src1 = ggml_nelements(src1);
+        src1_f16_alloc.alloc(ne_src1);
+        GGML_ASSERT(to_fp16_cuda != nullptr);
+        to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
+    }
+    half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
 
     cuda_pool_alloc<half> dst_f16;
     char * dst_t;
@@ -8557,7 +8556,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     const void * beta = &beta_f16;
 
     if (dst->op_params[0] == GGML_PREC_DEFAULT) {
-        dst_t = (char *) dst_f16.alloc(ne);
+        dst_t = (char *) dst_f16.alloc(ne_dst);
 
         nbd2 /= sizeof(float) / sizeof(half);
         nbd3 /= sizeof(float) / sizeof(half);
@@ -8604,9 +8603,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUBLAS_CHECK(
         cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half),        // strideA
-                       (const char *) src1_as_f16.get(), CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
-                beta,  (      char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float),                          // strideC
+                alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
+                       (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
+                beta,  (      char *) dst_t, cu_data_type, ne01, nb2/nb0,         // strideC
                 ne12*ne13,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
@@ -8619,12 +8618,13 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
         dim3 block_dims(ne13, ne12);
         k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
-                src0_as_f16, src1_as_f16.get(), dst_t,
+                src0_f16, src1_f16, dst_t,
                 ptrs_src.get(), ptrs_dst.get(),
                 ne12, ne13,
                 ne23,
                 nb02, nb03,
-                nb12, nb13,
+                src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
+                src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
                 nbd2, nbd3,
                 r2, r3);
         CUDA_CHECK(cudaGetLastError());
@@ -8632,8 +8632,8 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUBLAS_CHECK(
         cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
-                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+                alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
+                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
                 beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
                 ne23,
                 cu_compute_type,
@@ -8643,7 +8643,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
     if (dst->op_params[0] == GGML_PREC_DEFAULT) {
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16.get(), dst_ddf, ne, main_stream);
+        to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
     }
 }
 
@@ -8662,11 +8662,25 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }
 
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
+    bool use_mul_mat_q = ggml_is_quantized(src0->type);
 #ifdef CUDA_USE_TENSOR_CORES
-    const bool use_tensor_cores = true;
+    use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
+#endif // CUDA_USE_TENSOR_CORES
+
 #else
-    const bool use_tensor_cores = false;
-#endif
+
+    const bool fp16_performance_good = min_compute_capability >= CC_VOLTA;
+    bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+#ifdef CUDA_USE_TENSOR_CORES
+    // when tensor cores are available, use them for large batch size
+    // ref: https://github.com/ggerganov/llama.cpp/pull/3776
+    use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE);
+#endif // CUDA_USE_TENSOR_CORES
+
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 
     // debug helpers
     //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
@@ -8676,19 +8690,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+    } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+    } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
         // KQ + KQV multi-batch
         ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->type == GGML_TYPE_F32) {
 #ifdef GGML_CUDA_FORCE_DMMV
             const bool use_mul_mat_vec_q = false;
 #else
@@ -8702,14 +8716,6 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
         }
     } else {
-        bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
-
-        // when tensor cores are available, use them for large batch size
-        // ref: https://github.com/ggerganov/llama.cpp/pull/3776
-        if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
-            use_mul_mat_q = false;
-        }
-
         if (use_mul_mat_q) {
            ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
        } else {
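
Note: the last two hunks hoist the MMQ-versus-cuBLAS decision out of the trailing else branch and split it by platform (HIP/ROCm vs CUDA). A minimal sketch of the resulting non-HIP policy, as a toy reimplementation with stand-in constants (the value of MMQ_MAX_BATCH_SIZE below is an assumption for illustration, not taken from this diff):

    /* Toy model of the CUDA-side kernel selection after this change.
     * CC_VOLTA and MIN_CC_DP4A match the #defines shown above;
     * MMQ_MAX_BATCH_SIZE is a stand-in value. */
    #include <stdbool.h>
    #include <stdio.h>

    #define CC_VOLTA           700
    #define MIN_CC_DP4A        610
    #define MMQ_MAX_BATCH_SIZE 32   // assumption for illustration

    static bool pick_mul_mat_q(int min_cc, bool src0_quantized, long src1_ne1, bool tensor_cores) {
        const bool fp16_performance_good = min_cc >= CC_VOLTA;
        bool use_mul_mat_q = min_cc >= MIN_CC_DP4A && src0_quantized;
        if (tensor_cores) {
            // large batches on fp16-capable GPUs fall back to the cuBLAS fp16 path
            use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1_ne1 > MMQ_MAX_BATCH_SIZE);
        }
        return use_mul_mat_q;
    }

    int main(void) {
        printf("%d\n", pick_mul_mat_q(700, true, 512, true));  // 0: large batch -> cuBLAS
        printf("%d\n", pick_mul_mat_q(700, true,   8, true));  // 1: small batch -> MMQ
        printf("%d\n", pick_mul_mat_q(610, true, 512, false)); // 1: no tensor cores -> MMQ
        return 0;
    }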
@@ -10033,14 +10039,19 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
             }
             return false;
         } break;
+        case GGML_OP_DUP:
+        case GGML_OP_REPEAT:
+        case GGML_OP_CONCAT:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+            } break;
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
         case GGML_OP_NORM:
-        case GGML_OP_REPEAT:
-        case GGML_OP_DUP:
         case GGML_OP_ADD:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
@@ -10057,7 +10068,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
-        case GGML_OP_CONCAT:
         case GGML_OP_GROUP_NORM:
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:

data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-impl.h

@@ -5,6 +5,7 @@
 // GGML internal header
 
 #include <assert.h>
+#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
 #include <stddef.h>
 #include <stdbool.h>
 #include <string.h> // memcpy
data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.m

@@ -87,6 +87,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(get_rows_q4_K);
     GGML_METAL_DECL_KERNEL(get_rows_q5_K);
     GGML_METAL_DECL_KERNEL(get_rows_q6_K);
+    GGML_METAL_DECL_KERNEL(get_rows_i32);
     GGML_METAL_DECL_KERNEL(rms_norm);
     GGML_METAL_DECL_KERNEL(group_norm);
     GGML_METAL_DECL_KERNEL(norm);
@@ -257,13 +258,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
 #endif
         NSError * error = nil;
-        NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
+        NSString * libPath = [bundle pathForResource:@"ggml" ofType:@"metallib"];
         if (libPath != nil) {
+            // pre-compiled library found
             NSURL * libURL = [NSURL fileURLWithPath:libPath];
             GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
             ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
         } else {
-            GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
+            GGML_METAL_LOG_INFO("%s: ggml.metallib not found, loading from source\n", __func__);
 
             NSString * sourcePath;
             NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
@@ -291,6 +293,13 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         options = [MTLCompileOptions new];
         options.preprocessorMacros = @{ @"QK_K" : @(64) };
 #endif
+        // try to disable fast-math
+        // NOTE: this seems to have no effect whatsoever
+        // instead, in order to disable fast-math, we have to build ggml.metallib from the command line
+        // using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
+        // and go through the "pre-compiled library found" path above
+        //[options setFastMathEnabled:false];
+
         ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
     }
 
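Note: per the new comments, toggling fast-math through MTLCompileOptions has no observable effect here, so the intended route is to pre-compile the shaders (xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air, as quoted above, presumably followed by packaging the .air into ggml.metallib with the metallib tool) so that ggml_metal_init takes the "pre-compiled library found" branch, which now looks for ggml.metallib instead of default.metallib.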
@@ -369,6 +378,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(get_rows_q4_K);
         GGML_METAL_ADD_KERNEL(get_rows_q5_K);
         GGML_METAL_ADD_KERNEL(get_rows_q6_K);
+        GGML_METAL_ADD_KERNEL(get_rows_i32);
         GGML_METAL_ADD_KERNEL(rms_norm);
         GGML_METAL_ADD_KERNEL(group_norm);
         GGML_METAL_ADD_KERNEL(norm);
@@ -491,6 +501,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(get_rows_q4_K);
     GGML_METAL_DEL_KERNEL(get_rows_q5_K);
     GGML_METAL_DEL_KERNEL(get_rows_q6_K);
+    GGML_METAL_DEL_KERNEL(get_rows_i32);
     GGML_METAL_DEL_KERNEL(rms_norm);
     GGML_METAL_DEL_KERNEL(group_norm);
     GGML_METAL_DEL_KERNEL(norm);
@@ -1230,7 +1241,7 @@ void ggml_metal_graph_compute(
             // not sure how to avoid this
             // TODO: make a simpler cpy_bytes kernel
 
-            const int nth = MIN(1024, ne00);
+            const int nth = MIN((int) ctx->pipeline_cpy_f32_f32.maxTotalThreadsPerThreadgroup, ne00);
 
             [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32];
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -1285,7 +1296,7 @@ void ggml_metal_graph_compute(
             [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
             [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
 
-            const int nth = MIN(1024, ne0);
+            const int nth = MIN((int) ctx->pipeline_add.maxTotalThreadsPerThreadgroup, ne00);
 
             [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
         } break;
@@ -1649,6 +1660,10 @@ void ggml_metal_graph_compute(
                 }
             };
 
+            if (ggml_is_quantized(src0t)) {
+                GGML_ASSERT(ne00 >= nth0*nth1);
+            }
+
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
             [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
             [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
@@ -1707,6 +1722,9 @@ void ggml_metal_graph_compute(
             // TODO: make this more general
             GGML_ASSERT(n_as <= 8);
 
+            // max size of the src1ids array in the kernel stack
+            GGML_ASSERT(ne11 <= 512);
+
             struct ggml_tensor * src2 = gf->nodes[i]->src[2];
 
             const int64_t ne20 = src2 ? src2->ne[0] : 0;
@@ -1724,9 +1742,6 @@ void ggml_metal_graph_compute(
             GGML_ASSERT(!ggml_is_transposed(src2));
             GGML_ASSERT(!ggml_is_transposed(src1));
 
-            GGML_ASSERT(ne20 % 32 == 0);
-            // !!!!!!!!! TODO: this assert is probably required but not sure!
-            //GGML_ASSERT(ne20 >= 64);
             GGML_ASSERT(src1t == GGML_TYPE_F32);
 
             const uint r2 = ne12/ne22;
@@ -1734,22 +1749,22 @@ void ggml_metal_graph_compute(
 
             // find the break-even point where the matrix-matrix kernel becomes more efficient compared
             // to the matrix-vector kernel
-            int ne11_mm_min = 1;
+            int ne11_mm_min = n_as;
 
             const int idx = ((int32_t *) dst->op_params)[0];
 
             // batch size
             GGML_ASSERT(ne01 == ne11);
 
-            const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory
-
             // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
             // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
             // !!!
             // TODO: for now, always use mat-vec kernels until we figure out how to improve the
             // indirect matrix multiplication
             // !!!
-            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && _ne1 > ne11_mm_min) {
+            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+                ne20 % 32 == 0 && ne20 >= 64 &&
+                ne11 > ne11_mm_min) {
                 switch (src2->type) {
                     case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32]; break;
                     case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32]; break;
@@ -1779,14 +1794,15 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
                 [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
                 [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
-                [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:14];
+                [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
                 [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
                 [encoder setBytes:&r2 length:sizeof(r2) atIndex:16];
                 [encoder setBytes:&r3 length:sizeof(r3) atIndex:17];
                 [encoder setBytes:&idx length:sizeof(idx) atIndex:18];
                 // TODO: how to make this an array? read Metal docs
-                for (int j = 0; j < n_as; ++j) {
-                    struct ggml_tensor * src_cur = dst->src[2 + j];
+                for (int j = 0; j < 8; ++j) {
+                    // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
+                    struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
 
                     size_t offs_src_cur = 0;
                     id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
@@ -1796,8 +1812,7 @@ void ggml_metal_graph_compute(
 
                 [encoder setThreadgroupMemoryLength:8192 atIndex:0];
 
-                // TODO: processing one row at a time (ne11 -> 1) is not efficient
-                [encoder dispatchThreadgroups:MTLSizeMake( (_ne1 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
             } else {
                 int nth0 = 32;
                 int nth1 = 1;
@@ -1880,11 +1895,17 @@ void ggml_metal_graph_compute(
                     } break;
                     default:
                         {
-                            GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
+                            GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
                             GGML_ASSERT(false && "not implemented");
                         }
                 };
 
+                if (ggml_is_quantized(src2t)) {
+                    GGML_ASSERT(ne20 >= nth0*nth1);
+                }
+
+                const int64_t _ne1 = 1; // kernels needs a reference in constant memory
+
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                 [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
@@ -1909,8 +1930,9 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&r3 length:sizeof(r3) atIndex:21];
                 [encoder setBytes:&idx length:sizeof(idx) atIndex:22];
                 // TODO: how to make this an array? read Metal docs
-                for (int j = 0; j < n_as; ++j) {
-                    struct ggml_tensor * src_cur = dst->src[2 + j];
+                for (int j = 0; j < 8; ++j) {
+                    // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
+                    struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
 
                     size_t offs_src_cur = 0;
                     id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
@@ -1959,6 +1981,7 @@ void ggml_metal_graph_compute(
             case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
             case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
             case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
+            case GGML_TYPE_I32: [encoder setComputePipelineState:ctx->pipeline_get_rows_i32]; break;
             default: GGML_ASSERT(false && "not implemented");
         }
 
@@ -2229,7 +2252,7 @@ void ggml_metal_graph_compute(
             [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
             [encoder setBytes:&sf length:sizeof(sf) atIndex:18];
 
-            const int nth = MIN(1024, ne0);
+            const int nth = MIN((int) ctx->pipeline_upscale_f32.maxTotalThreadsPerThreadgroup, ne0);
 
             [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
         } break;
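
Note: the three thread-count sites touched in this file (the cpy fallback, the broadcast add path, and upscale) previously clamped nth to a hard-coded 1024; they now clamp to the corresponding pipeline's maxTotalThreadsPerThreadgroup, so the dispatched threadgroup size always respects what each compiled pipeline actually allows.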