llama_cpp 0.10.3 → 0.10.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: e679eaf867f62033f7d586a8ef131f2126cb3efb2fde49af7c0be17492a66edf
- data.tar.gz: da1e9828c456677dc877db6b9754e961ceff27ecfc93c48abd7624d9bb8cdd29
+ metadata.gz: 4db71bfe6290f23102180e5fb7544e4c752ac895f6fefdbb0c1b0b1e52660ebc
+ data.tar.gz: 72b1a13ae9c30230eb18eb83feba0c053297a5da22bd99fabee22cf3c1f9ec7d
  SHA512:
- metadata.gz: b1fd0737acaa229493e2cbacc79f5b0b6b91233d40e26b57ab7005945ddba79ea3f44e2cca8a0d75df3695373f8eaa2fdfd4ff766a166a688c051beb2acfb126
- data.tar.gz: '01889a0ff9ebabd400fa374066659686ee84d4afab973cdd55b36ce5588bded1ed424a88296c1a26acc413f1e4f98f9f6e36eebaf7f37874b91a335dd147d3f4'
+ metadata.gz: 8b7406aaf8d1612678635f686c5770d7ce592596caa5cd5687a66eefa088945f36ef6d40e440a8c29956fc1623c1d2713db9ee37b0f35a0d7d959564c7eb929b
+ data.tar.gz: 307e1471b63cbad71d65b816794ca497e9b883bd2430cb9cef6425f7972cbdd93d6ef68aa6358525f1f82a814c40041240df6de3183a669d7d01607ae20d98e7
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+ ## [[0.10.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.3...v0.10.4)] - 2024-01-06
+
+ - Bump bundled llama.cpp from b1710 to b1768.
+
  ## [[0.10.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.2...v0.10.3)] - 2023-12-29

  - Bump bundled llama.cpp from b1686 to b1710.
data/LICENSE.txt CHANGED
@@ -1,6 +1,6 @@
  The MIT License (MIT)

- Copyright (c) 2023 Atsushi Tatsuma
+ Copyright (c) 2023-2024 Atsushi Tatsuma

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -614,10 +614,14 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
  }

  static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
- return true;
+ switch (op->op) {
+ case GGML_OP_MUL_MAT:
+ return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+ default:
+ return true;
+ }

  GGML_UNUSED(backend);
- GGML_UNUSED(op);
  }

  static struct ggml_backend_i cpu_backend_i = {
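With this hunk the CPU backend stops reporting unconditional support: for GGML_OP_MUL_MAT it now requires src1 to be F32 or already laid out in the vec_dot type of src0, so a scheduler can route other MUL_MAT nodes elsewhere. Below is a minimal sketch of how a caller could consult this through the public ggml-backend API, assuming the ggml-backend.h of this llama.cpp vintage; it is an illustration, not code from the diff.

    #include "ggml.h"
    #include "ggml-backend.h"

    // Returns whether the CPU backend of this build is willing to run `node`;
    // after the change above this reflects the stricter MUL_MAT src1-type check.
    static bool cpu_can_run(const struct ggml_tensor * node) {
        ggml_backend_t cpu = ggml_backend_cpu_init();
        const bool ok = ggml_backend_supports_op(cpu, node);
        ggml_backend_free(cpu);
        return ok;
    }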
@@ -119,7 +119,9 @@
  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
  #define CC_VOLTA 700
  #define CC_OFFSET_AMD 1000000
+ #define CC_RDNA1 (CC_OFFSET_AMD + 1010)
  #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
+ #define CC_RDNA3 (CC_OFFSET_AMD + 1100)

  #define GGML_CUDA_MAX_NODES 8192

@@ -133,7 +135,6 @@

  // TODO: improve this to be correct for more hardware
  // for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
- // probably other such cases, and not sure what happens on AMD hardware
  #if !defined(GGML_CUDA_FORCE_MMQ)
  #define CUDA_USE_TENSOR_CORES
  #endif
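The new CC_RDNA1 and CC_RDNA3 constants extend the scheme that folds AMD GPUs into the same integer compute-capability scale as NVIDIA parts by adding CC_OFFSET_AMD; the dispatch changes later in this diff compare against them. A small self-contained illustration of the encoding (the gfx annotations are my reading, not part of the diff):

    #include <stdio.h>

    #define CC_OFFSET_AMD 1000000
    #define CC_RDNA1 (CC_OFFSET_AMD + 1010)  // roughly gfx101x
    #define CC_RDNA2 (CC_OFFSET_AMD + 1030)  // roughly gfx103x
    #define CC_RDNA3 (CC_OFFSET_AMD + 1100)  // roughly gfx110x

    int main(void) {
        const int cc = CC_OFFSET_AMD + 1100;  // e.g. a value an RDNA3 card could report
        // the offset keeps AMD values far above any NVIDIA compute capability (CC_VOLTA is 700)
        printf("amd=%d rdna1+=%d rdna3=%d\n", cc >= CC_OFFSET_AMD, cc >= CC_RDNA1, cc >= CC_RDNA3);
        return 0;
    }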
@@ -6662,7 +6663,7 @@ static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) {
  // pool with virtual memory
  static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
  static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
- static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
+ static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

  static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) {
  scoped_spin_lock lock(g_cuda_pool_lock);
@@ -7485,6 +7486,8 @@ static void ggml_cuda_op_dequantize_mul_mat_vec(
  const int64_t ne00 = src0->ne[0];
  const int64_t row_diff = row_high - row_low;

+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
  // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
  #ifdef GGML_CUDA_F16
  cuda_pool_alloc<half> src1_dfloat_a;
@@ -7577,6 +7580,7 @@ static void ggml_cuda_op_mul_mat_cublas(
  const int compute_capability = g_device_caps[id].cc;

  if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
+ //printf("this branch\n");
  // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
  cuda_pool_alloc<half> src0_as_f16;
  if (src0->type != GGML_TYPE_F16) {
@@ -7614,9 +7618,9 @@

  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
  to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
- }
- else {
+ } else {
  cuda_pool_alloc<float> src0_ddq_as_f32;
+ cuda_pool_alloc<float> src1_ddq_as_f32;

  if (src0->type != GGML_TYPE_F32) {
  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
@@ -7624,7 +7628,15 @@
  src0_ddq_as_f32.alloc(row_diff*ne00);
  to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
  }
+ if (src1->type != GGML_TYPE_F32) {
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
+ GGML_ASSERT(to_fp32_cuda != nullptr);
+ src1_ddq_as_f32.alloc(src1_ncols*ne10);
+ to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
+ }
+
  const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
+ const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();

  const float alpha = 1.0f;
  const float beta = 0.0f;
@@ -7633,9 +7645,9 @@
  CUBLAS_CHECK(
  cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
  row_diff, src1_ncols, ne10,
- &alpha, src0_ddf_i, ne00,
- src1_ddf_i, ne10,
- &beta, dst_dd_i, ldc));
+ &alpha, src0_ddf_i, ne00,
+ src1_ddf1_i, ne10,
+ &beta, dst_dd_i, ldc));
  }

  (void) dst;
@@ -8035,6 +8047,7 @@ static void ggml_cuda_op_mul_mat(

  GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
  GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));

  GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);

@@ -8481,9 +8494,9 @@ static __global__ void k_compute_batched_ptrs(
  int64_t i03 = i13 / r3;
  int64_t i02 = i12 / r2;

- ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
- ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
- ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
+ ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
+ ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
  }

  static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -8492,28 +8505,10 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const

  GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
- const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];

- const int64_t nb01 = src0->nb[1];
- const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
- const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
-
- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
- const int64_t ne12 = src1->ne[2];
- const int64_t ne13 = src1->ne[3];
-
- const int64_t nb11 = src1->nb[1];
- const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
- const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+ GGML_TENSOR_BINARY_OP_LOCALS

- const int64_t ne1 = ggml_nelements(src1);
- const int64_t ne = ggml_nelements(dst);
+ const int64_t ne_dst = ggml_nelements(dst);

  ggml_cuda_set_device(g_main_device);
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
@@ -8522,7 +8517,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const

  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
  void * src0_ddq = src0_extra->data_device[g_main_device];
- half * src0_as_f16 = (half *) src0_ddq;
+ half * src0_f16 = (half *) src0_ddq;

  ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
@@ -8531,11 +8526,15 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

  // convert src1 to fp16
- const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
- GGML_ASSERT(to_fp16_cuda != nullptr);
-
- cuda_pool_alloc<half> src1_as_f16(ne1);
- to_fp16_cuda(src1_ddf, src1_as_f16.get(), ne1, main_stream);
+ cuda_pool_alloc<half> src1_f16_alloc;
+ if (src1->type != GGML_TYPE_F16) {
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+ const int64_t ne_src1 = ggml_nelements(src1);
+ src1_f16_alloc.alloc(ne_src1);
+ GGML_ASSERT(to_fp16_cuda != nullptr);
+ to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
+ }
+ half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();

  cuda_pool_alloc<half> dst_f16;
  char * dst_t;
@@ -8557,7 +8556,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
  const void * beta = &beta_f16;

  if (dst->op_params[0] == GGML_PREC_DEFAULT) {
- dst_t = (char *) dst_f16.alloc(ne);
+ dst_t = (char *) dst_f16.alloc(ne_dst);

  nbd2 /= sizeof(float) / sizeof(half);
  nbd3 /= sizeof(float) / sizeof(half);
@@ -8604,9 +8603,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
  CUBLAS_CHECK(
  cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
  ne01, ne11, ne10,
- alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
- (const char *) src1_as_f16.get(), CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
- beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
+ alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
+ (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
+ beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC
  ne12*ne13,
  cu_compute_type,
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
@@ -8619,12 +8618,13 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const

  dim3 block_dims(ne13, ne12);
  k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
- src0_as_f16, src1_as_f16.get(), dst_t,
+ src0_f16, src1_f16, dst_t,
  ptrs_src.get(), ptrs_dst.get(),
  ne12, ne13,
  ne23,
  nb02, nb03,
- nb12, nb13,
+ src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
+ src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
  nbd2, nbd3,
  r2, r3);
  CUDA_CHECK(cudaGetLastError());
@@ -8632,8 +8632,8 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
  CUBLAS_CHECK(
  cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
  ne01, ne11, ne10,
- alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
- (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+ alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
+ (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
  beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
  ne23,
  cu_compute_type,
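The two cuBLAS calls above now compute leading dimensions and batch strides as nb01/nb00, nb11/nb10 and nb2/nb0 instead of dividing by sizeof(half) or sizeof(float): cuBLAS wants these values in elements, ggml's nb* strides are in bytes, and dividing by the tensor's own element stride stays correct whether src1 arrives as F32 or F16. A small worked example of that arithmetic (the concrete numbers are illustrative, not taken from the diff):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        // a contiguous F16 matrix with ne10 = 4096 columns
        const int64_t ne10 = 4096;
        const int64_t nb10 = 2;            // bytes per F16 element
        const int64_t nb11 = nb10 * ne10;  // bytes per row
        // leading dimension in elements, independent of the element type
        printf("ldb = %lld\n", (long long) (nb11 / nb10));  // prints 4096
        return 0;
    }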
@@ -8643,7 +8643,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const

  if (dst->op_params[0] == GGML_PREC_DEFAULT) {
  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
- to_fp32_cuda(dst_f16.get(), dst_ddf, ne, main_stream);
+ to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
  }
  }

@@ -8662,11 +8662,25 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
  }
  }

+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+ const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
+ bool use_mul_mat_q = ggml_is_quantized(src0->type);
  #ifdef CUDA_USE_TENSOR_CORES
- const bool use_tensor_cores = true;
+ use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
+ #endif // CUDA_USE_TENSOR_CORES
+
  #else
- const bool use_tensor_cores = false;
- #endif
+
+ const bool fp16_performance_good = min_compute_capability >= CC_VOLTA;
+ bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+ #ifdef CUDA_USE_TENSOR_CORES
+ // when tensor cores are available, use them for large batch size
+ // ref: https://github.com/ggerganov/llama.cpp/pull/3776
+ use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE);
+ #endif // CUDA_USE_TENSOR_CORES
+
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

  // debug helpers
  //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
@@ -8676,19 +8690,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
  //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
  //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

- if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+ if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
  // KQ single-batch
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
- } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+ } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
  // KQV single-batch
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
- } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+ } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
  // KQ + KQV multi-batch
  ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
  } else if (src0->type == GGML_TYPE_F32) {
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
- if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+ if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->type == GGML_TYPE_F32) {
  #ifdef GGML_CUDA_FORCE_DMMV
  const bool use_mul_mat_vec_q = false;
  #else
@@ -8702,14 +8716,6 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
  }
  } else {
- bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
-
- // when tensor cores are available, use them for large batch size
- // ref: https://github.com/ggerganov/llama.cpp/pull/3776
- if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
- use_mul_mat_q = false;
- }
-
  if (use_mul_mat_q) {
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
  } else {
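The quantized mul-mat kernel selection is now decided once, up front, with separate HIP/AMD and CUDA branches replacing the old use_tensor_cores flag. A compilable, hedged restatement of the CUDA (non-HIP) branch follows; the MMQ_MAX_BATCH_SIZE value of 32 is an assumption about the surrounding ggml-cuda.cu, not something shown in this diff:

    #include <stdbool.h>
    #include <stdint.h>

    // Mirrors the defines used above (MMQ_MAX_BATCH_SIZE is assumed to be 32 here).
    #define MIN_CC_DP4A 610
    #define CC_VOLTA 700
    #define MMQ_MAX_BATCH_SIZE 32

    // tensor_cores_enabled stands in for the CUDA_USE_TENSOR_CORES compile-time switch.
    static bool want_mul_mat_q(int min_compute_capability, bool src0_is_quantized,
                               int64_t src1_ne1, bool tensor_cores_enabled) {
        const bool fp16_performance_good = min_compute_capability >= CC_VOLTA;
        bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && src0_is_quantized;
        if (tensor_cores_enabled) {
            // prefer the cuBLAS/tensor-core path for large batches (see PR #3776 above)
            use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1_ne1 > MMQ_MAX_BATCH_SIZE);
        }
        return use_mul_mat_q;
    }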
@@ -10033,14 +10039,19 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
  }
  return false;
  } break;
+ case GGML_OP_DUP:
+ case GGML_OP_REPEAT:
+ case GGML_OP_CONCAT:
+ {
+ ggml_type src0_type = op->src[0]->type;
+ return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+ } break;
  case GGML_OP_NONE:
  case GGML_OP_RESHAPE:
  case GGML_OP_VIEW:
  case GGML_OP_PERMUTE:
  case GGML_OP_TRANSPOSE:
  case GGML_OP_NORM:
- case GGML_OP_REPEAT:
- case GGML_OP_DUP:
  case GGML_OP_ADD:
  case GGML_OP_MUL:
  case GGML_OP_DIV:
@@ -10057,7 +10068,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
  case GGML_OP_SUM_ROWS:
  case GGML_OP_ARGSORT:
  case GGML_OP_ACC:
- case GGML_OP_CONCAT:
  case GGML_OP_GROUP_NORM:
  case GGML_OP_UPSCALE:
  case GGML_OP_PAD:
@@ -5,6 +5,7 @@
  // GGML internal header

  #include <assert.h>
+ #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
  #include <stddef.h>
  #include <stdbool.h>
  #include <string.h> // memcpy
@@ -87,6 +87,7 @@ struct ggml_metal_context {
  GGML_METAL_DECL_KERNEL(get_rows_q4_K);
  GGML_METAL_DECL_KERNEL(get_rows_q5_K);
  GGML_METAL_DECL_KERNEL(get_rows_q6_K);
+ GGML_METAL_DECL_KERNEL(get_rows_i32);
  GGML_METAL_DECL_KERNEL(rms_norm);
  GGML_METAL_DECL_KERNEL(group_norm);
  GGML_METAL_DECL_KERNEL(norm);
@@ -257,13 +258,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
  bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
  #endif
  NSError * error = nil;
- NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
+ NSString * libPath = [bundle pathForResource:@"ggml" ofType:@"metallib"];
  if (libPath != nil) {
+ // pre-compiled library found
  NSURL * libURL = [NSURL fileURLWithPath:libPath];
  GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
  ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
  } else {
- GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
+ GGML_METAL_LOG_INFO("%s: ggml.metallib not found, loading from source\n", __func__);

  NSString * sourcePath;
  NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
@@ -291,6 +293,13 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
  options = [MTLCompileOptions new];
  options.preprocessorMacros = @{ @"QK_K" : @(64) };
  #endif
+ // try to disable fast-math
+ // NOTE: this seems to have no effect whatsoever
+ // instead, in order to disable fast-math, we have to build ggml.metallib from the command line
+ // using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
+ // and go through the "pre-compiled library found" path above
+ //[options setFastMathEnabled:false];
+
  ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
  }

@@ -369,6 +378,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
  GGML_METAL_ADD_KERNEL(get_rows_q4_K);
  GGML_METAL_ADD_KERNEL(get_rows_q5_K);
  GGML_METAL_ADD_KERNEL(get_rows_q6_K);
+ GGML_METAL_ADD_KERNEL(get_rows_i32);
  GGML_METAL_ADD_KERNEL(rms_norm);
  GGML_METAL_ADD_KERNEL(group_norm);
  GGML_METAL_ADD_KERNEL(norm);
@@ -491,6 +501,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
  GGML_METAL_DEL_KERNEL(get_rows_q4_K);
  GGML_METAL_DEL_KERNEL(get_rows_q5_K);
  GGML_METAL_DEL_KERNEL(get_rows_q6_K);
+ GGML_METAL_DEL_KERNEL(get_rows_i32);
  GGML_METAL_DEL_KERNEL(rms_norm);
  GGML_METAL_DEL_KERNEL(group_norm);
  GGML_METAL_DEL_KERNEL(norm);
@@ -1230,7 +1241,7 @@ void ggml_metal_graph_compute(
  // not sure how to avoid this
  // TODO: make a simpler cpy_bytes kernel

- const int nth = MIN(1024, ne00);
+ const int nth = MIN((int) ctx->pipeline_cpy_f32_f32.maxTotalThreadsPerThreadgroup, ne00);

  [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32];
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -1285,7 +1296,7 @@ void ggml_metal_graph_compute(
  [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
  [encoder setBytes:&offs length:sizeof(offs) atIndex:27];

- const int nth = MIN(1024, ne0);
+ const int nth = MIN((int) ctx->pipeline_add.maxTotalThreadsPerThreadgroup, ne00);

  [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
  } break;
@@ -1649,6 +1660,10 @@ void ggml_metal_graph_compute(
  }
  };

+ if (ggml_is_quantized(src0t)) {
+ GGML_ASSERT(ne00 >= nth0*nth1);
+ }
+
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
  [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
  [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
@@ -1707,6 +1722,9 @@ void ggml_metal_graph_compute(
  // TODO: make this more general
  GGML_ASSERT(n_as <= 8);

+ // max size of the src1ids array in the kernel stack
+ GGML_ASSERT(ne11 <= 512);
+
  struct ggml_tensor * src2 = gf->nodes[i]->src[2];

  const int64_t ne20 = src2 ? src2->ne[0] : 0;
@@ -1724,9 +1742,6 @@ void ggml_metal_graph_compute(
  GGML_ASSERT(!ggml_is_transposed(src2));
  GGML_ASSERT(!ggml_is_transposed(src1));

- GGML_ASSERT(ne20 % 32 == 0);
- // !!!!!!!!! TODO: this assert is probably required but not sure!
- //GGML_ASSERT(ne20 >= 64);
  GGML_ASSERT(src1t == GGML_TYPE_F32);

  const uint r2 = ne12/ne22;
@@ -1734,22 +1749,22 @@ void ggml_metal_graph_compute(

  // find the break-even point where the matrix-matrix kernel becomes more efficient compared
  // to the matrix-vector kernel
- int ne11_mm_min = 1;
+ int ne11_mm_min = n_as;

  const int idx = ((int32_t *) dst->op_params)[0];

  // batch size
  GGML_ASSERT(ne01 == ne11);

- const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory
-
  // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
  // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
  // !!!
  // TODO: for now, always use mat-vec kernels until we figure out how to improve the
  // indirect matrix multiplication
  // !!!
- if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && _ne1 > ne11_mm_min) {
+ if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+ ne20 % 32 == 0 && ne20 >= 64 &&
+ ne11 > ne11_mm_min) {
  switch (src2->type) {
  case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32]; break;
  case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32]; break;
@@ -1779,14 +1794,15 @@ void ggml_metal_graph_compute(
  [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
  [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
  [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
- [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:14];
+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
  [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
  [encoder setBytes:&r2 length:sizeof(r2) atIndex:16];
  [encoder setBytes:&r3 length:sizeof(r3) atIndex:17];
  [encoder setBytes:&idx length:sizeof(idx) atIndex:18];
  // TODO: how to make this an array? read Metal docs
- for (int j = 0; j < n_as; ++j) {
- struct ggml_tensor * src_cur = dst->src[2 + j];
+ for (int j = 0; j < 8; ++j) {
+ // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
+ struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];

  size_t offs_src_cur = 0;
  id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
@@ -1796,8 +1812,7 @@ void ggml_metal_graph_compute(

  [encoder setThreadgroupMemoryLength:8192 atIndex:0];

- // TODO: processing one row at a time (ne11 -> 1) is not efficient
- [encoder dispatchThreadgroups:MTLSizeMake( (_ne1 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+ [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
  } else {
  int nth0 = 32;
  int nth1 = 1;
@@ -1880,11 +1895,17 @@ void ggml_metal_graph_compute(
  } break;
  default:
  {
- GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
+ GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
  GGML_ASSERT(false && "not implemented");
  }
  };

+ if (ggml_is_quantized(src2t)) {
+ GGML_ASSERT(ne20 >= nth0*nth1);
+ }
+
+ const int64_t _ne1 = 1; // kernels needs a reference in constant memory
+
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
  [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
  [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
@@ -1909,8 +1930,9 @@ void ggml_metal_graph_compute(
  [encoder setBytes:&r3 length:sizeof(r3) atIndex:21];
  [encoder setBytes:&idx length:sizeof(idx) atIndex:22];
  // TODO: how to make this an array? read Metal docs
- for (int j = 0; j < n_as; ++j) {
- struct ggml_tensor * src_cur = dst->src[2 + j];
+ for (int j = 0; j < 8; ++j) {
+ // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
+ struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];

  size_t offs_src_cur = 0;
  id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
@@ -1959,6 +1981,7 @@ void ggml_metal_graph_compute(
  case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
  case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
  case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
+ case GGML_TYPE_I32: [encoder setComputePipelineState:ctx->pipeline_get_rows_i32]; break;
  default: GGML_ASSERT(false && "not implemented");
  }

@@ -2229,7 +2252,7 @@ void ggml_metal_graph_compute(
  [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
  [encoder setBytes:&sf length:sizeof(sf) atIndex:18];

- const int nth = MIN(1024, ne0);
+ const int nth = MIN((int) ctx->pipeline_upscale_f32.maxTotalThreadsPerThreadgroup, ne0);

  [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
  } break;