llama_cpp 0.9.1 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: dae7507ce41f18e3fd0fb2d7445275a387a3914068aa9eef922f260de699970a
- data.tar.gz: d66cc2629aeca3285bc10988f8c410fb8cf5b7f1fe6f835b5dc60e9dcab4be9d
+ metadata.gz: 66c53ea31dd93cc684d6bbc5331bb7e9f12abe2a23e6e16b8f8a3407e62961a0
+ data.tar.gz: 723d4f1d879c314d1733c84411e39d470f619a22be6a17d589406e831d8ea97b
  SHA512:
- metadata.gz: 3e3e92aa38413877620947ec7996494cd720a3c211fcdf1973ce0d7a9a7e8803e293e2ce2f601b11e35858c5b4ef6b00d716069e322ea8d6b4c93412990fd746
- data.tar.gz: 20a1e9e0e5812da9b00787afbf0f3aa0b762c8168f54ce3b7f2f25ff5b61cca5b2e7ab5faa065fbc3e266468d1c5747b8e0779fc7e073cc66240d1f3085e71c7
+ metadata.gz: bee0ffe56796ec8bf6240178246c7c95c38ec7cec2bd29f61c1cd85e1230291751c13da850c330fca644089ee2ff524a767b132b5bc6658e95205114e7399ba4
+ data.tar.gz: 382d05658c0a0d8df1c03dcaf93c8861bff3326e1d1e0c0cb3b0638f38cc3de5d36990b1f4df6d0bf3ce19337e9507cd5a2d196d893d8baf56d9b38a49738bc2
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+ ## [[0.9.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.1...v0.9.2)] - 2023-11-11
+
+ - Bump bundled llama.cpp from b1472 to b1500.
+
  ## [[0.9.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.0...v0.9.1)] - 2023-11-03

  - Bump bundled llama.cpp from b1429 to b1472
@@ -378,9 +378,13 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
  }
  }

- static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
+ static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view, bool update_backend) {
  assert(view->view_src != NULL && view->view_src->data != NULL);
- view->backend = view->view_src->backend;
+
+ if (update_backend) {
+ view->backend = view->view_src->backend;
+ }
+
  view->buffer = view->view_src->buffer;
  view->data = (char *)view->view_src->data + view->view_offs;

@@ -394,7 +398,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  struct hash_node * ht = alloc->hash_table;
  if (node->data == NULL) {
  if (ggml_is_view(node)) {
- init_view(alloc, node);
+ init_view(alloc, node, true);
  } else {
  // see if we can reuse a parent's buffer (inplace)
  if (ggml_op_can_inplace(node->op)) {
@@ -424,15 +428,14 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
  node->view_src = view_src;
  view_src_hn->n_views += 1;
- init_view(alloc, node);
+ init_view(alloc, node, false);
  return;
  }
- }
- else {
+ } else {
  AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
  node->view_src = parent;
  p_hn->n_views += 1;
- init_view(alloc, node);
+ init_view(alloc, node, false);
  return;
  }
  }
@@ -463,7 +466,7 @@ size_t ggml_allocr_alloc_graph_n(
  hash_get(ht, view_src)->n_views += 1;
  if (node->buffer == NULL && node->data != NULL) {
  // view of a pre-allocated tensor, didn't call init_view() yet
- init_view(alloc, node);
+ init_view(alloc, node, true);
  }
  }

@@ -474,7 +477,7 @@ size_t ggml_allocr_alloc_graph_n(
  }
  hash_get(ht, parent)->n_children += 1;
  if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
- init_view(alloc, parent);
+ init_view(alloc, parent, true);
  }
  }
  }
@@ -982,7 +982,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,

  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

- const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
  if (row > nrows) return;

  const int num_blocks_per_row = ncols / QK_K;
@@ -1086,7 +1086,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,

  static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

- const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
  if (row > nrows) return;

  const int num_blocks_per_row = ncols / QK_K;
@@ -1190,7 +1190,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,

  static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

- const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
  if (row > nrows) return;
  const int num_blocks_per_row = ncols / QK_K;
  const int ib0 = row*num_blocks_per_row;
@@ -1444,7 +1444,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,

  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

- const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
  if (row > nrows) return;

  const int num_blocks_per_row = ncols / QK_K;
@@ -4254,7 +4254,7 @@ template <bool need_check> static __global__ void

  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
  static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;

  if (row >= nrows) {
  return;
@@ -4294,7 +4294,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
  static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
  // qk = quantized weights per x block
  // qr = number of quantized weights per data value in x block
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;

  if (row >= nrows) {
  return;
@@ -4867,7 +4867,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
  static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
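Note for readers skimming the CUDA hunks above and below: every launcher switches its grid from `(1, block_num_y, 1)` to `(block_num_y, 1, 1)`, and the kernels index rows with `blockIdx.x` instead of `blockIdx.y`, because CUDA caps the y and z grid dimensions at 65535 blocks while the x dimension allows up to 2^31-1. The following self-contained sketch (invented for illustration, not shipped in the gem; the kernel name `row_sum` and all sizes are made up) shows the same row-to-`blockIdx.x` mapping in isolation:

```cuda
// Sketch only: rows are mapped to the x dimension of the grid so that very
// tall matrices (more than 65535 row blocks) can still be launched.
#include <cstdio>

__global__ void row_sum(const float * x, float * out, const int ncols, const int nrows) {
    // same indexing scheme as the patched kernels: the row comes from blockIdx.x
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    if (row >= nrows) return;

    float sum = 0.0f;
    for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
        sum += x[(size_t) row*ncols + col];
    }
    // crude reduction: every lane adds its partial sum to the row result
    atomicAdd(&out[row], sum);
}

int main() {
    const int nrows = 200000; // needs 100000 row blocks -> would not fit in gridDim.y (max 65535)
    const int ncols = 64;
    const int rows_per_block = 2;

    float *x, *out;
    cudaMallocManaged(&x,   (size_t) nrows*ncols*sizeof(float));
    cudaMallocManaged(&out, (size_t) nrows*sizeof(float));
    for (size_t i = 0; i < (size_t) nrows*ncols; ++i) x[i] = 1.0f;
    for (int i = 0; i < nrows; ++i) out[i] = 0.0f;

    const dim3 block_dims(32, rows_per_block, 1);
    const dim3 block_nums((nrows + rows_per_block - 1) / rows_per_block, 1, 1); // rows go into x
    row_sum<<<block_nums, block_dims>>>(x, out, ncols, nrows);
    cudaDeviceSynchronize();

    printf("out[0] = %.1f (expected %d)\n", out[0], ncols);
    cudaFree(x);
    cudaFree(out);
    return 0;
}
```

With 200000 rows and two rows per block, the launch needs 100000 blocks, which only fits in the x dimension; this is the failure mode the comment in the hunk above refers to.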
@@ -4876,7 +4877,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
  static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4885,7 +4886,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
  static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4894,7 +4895,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
  static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4903,7 +4904,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
  static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4913,7 +4914,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
  GGML_ASSERT(ncols % QK_K == 0);
  const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
  const int block_num_y = (nrows + ny - 1) / ny;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(32, ny, 1);
  dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }
@@ -4922,7 +4923,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
  GGML_ASSERT(ncols % QK_K == 0);
  const int ny = 2 / K_QUANTS_PER_ITERATION;
  const int block_num_y = (nrows + ny - 1) / ny;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(32, ny, 1);
  dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }
@@ -4931,7 +4932,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
  GGML_ASSERT(ncols % QK_K == 0);
  const int ny = 2 / K_QUANTS_PER_ITERATION;
  const int block_num_y = (nrows + ny - 1) / ny;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(32, ny, 1);
  dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }
@@ -4946,7 +4947,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
  GGML_ASSERT(ncols % QK_K == 0);
  const int ny = 2 / K_QUANTS_PER_ITERATION;
  const int block_num_y = (nrows + ny - 1) / ny;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(32, ny, 1);
  dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }
@@ -4954,7 +4955,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
  static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % QK4_0 == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4963,7 +4964,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
  static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % QK4_1 == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4972,7 +4973,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
  static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % QK5_0 == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4981,7 +4982,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
  static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % QK5_1 == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4990,7 +4991,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
  static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % QK8_0 == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4999,7 +5000,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
  static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % QK_K == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5008,7 +5009,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
  static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % QK_K == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5017,7 +5018,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
  static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % QK_K == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5026,7 +5027,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
  static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % QK_K == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5035,7 +5036,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
  static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % QK_K == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5054,7 +5055,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
  static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  dequantize_mul_mat_vec<1, 1, convert_f16>
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -5789,6 +5790,11 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
  CUDA_CHECK(cudaFree(ptr));
  }

+ static bool g_cublas_loaded = false;
+
+ bool ggml_cublas_loaded(void) {
+ return g_cublas_loaded;
+ }

  void ggml_init_cublas() {
  static bool initialized = false;
@@ -5802,7 +5808,12 @@ void ggml_init_cublas() {
  CUDA_CHECK(cudaDeviceSynchronize());
  #endif

- CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
+ if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
+ initialized = true;
+ g_cublas_loaded = false;
+ return;
+ }
+
  GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
  int64_t total_vram = 0;
  #if defined(GGML_CUDA_FORCE_MMQ)
@@ -5850,6 +5861,7 @@ void ggml_init_cublas() {
  // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));

  initialized = true;
+ g_cublas_loaded = true;
  }
  }

@@ -6892,6 +6904,8 @@ static void ggml_cuda_op_mul_mat(
  int64_t row_low[GGML_CUDA_MAX_DEVICES];
  int64_t row_high[GGML_CUDA_MAX_DEVICES];

+ int used_devices = 0;
+
  for (int64_t id = 0; id < g_device_count; ++id) {
  // by default, use all rows
  row_low[id] = 0;
@@ -6919,6 +6933,8 @@ static void ggml_cuda_op_mul_mat(
  continue;
  }

+ used_devices++;
+
  const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
  const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;

@@ -6957,12 +6973,12 @@ static void ggml_cuda_op_mul_mat(

  // if multiple devices are used they need to wait for the main device
  // here an event is recorded that signals that the main device has finished calculating the input data
- if (split && g_device_count > 1) {
+ if (split && used_devices > 1) {
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
  }

- const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
+ const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
  for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
  const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
  const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
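The comment in the hunk above describes an event-based handshake: the main device records an event on its stream once the input data is ready, and other streams wait on that event before consuming it. A minimal stand-alone sketch of the same pattern (not taken from the gem; the `produce`/`consume` kernels and the two streams are illustrative):

```cuda
// Sketch: stream B waits until stream A has produced its data,
// mirroring the cudaEventRecord / cudaStreamWaitEvent pairing above.
#include <cstdio>

__global__ void produce(float * x) { x[0] = 42.0f; }
__global__ void consume(const float * x, float * y) { y[0] = x[0] + 1.0f; }

int main() {
    float *x, *y;
    cudaMallocManaged(&x, sizeof(float));
    cudaMallocManaged(&y, sizeof(float));

    cudaStream_t a, b;
    cudaStreamCreate(&a);
    cudaStreamCreate(&b);
    cudaEvent_t done;
    cudaEventCreateWithFlags(&done, cudaEventDisableTiming);

    produce<<<1, 1, 0, a>>>(x);
    cudaEventRecord(done, a);        // signal: "input data is ready"
    cudaStreamWaitEvent(b, done, 0); // stream b blocks until the event fires
    consume<<<1, 1, 0, b>>>(x, y);

    cudaStreamSynchronize(b);
    printf("y = %.1f\n", y[0]);      // prints 43.0

    cudaEventDestroy(done);
    cudaStreamDestroy(a);
    cudaStreamDestroy(b);
    cudaFree(x);
    cudaFree(y);
    return 0;
}
```

The patch narrows the condition to `used_devices > 1` so this synchronization is skipped when only one device actually participates in the split.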
@@ -7078,6 +7094,9 @@ static void ggml_cuda_op_mul_mat(
  }

  for (int64_t id = 0; id < g_device_count; ++id) {
+ if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+ continue;
+ }
  CUDA_CHECK(ggml_cuda_set_device(id));

  // free buffers again when done
@@ -7102,6 +7121,9 @@ static void ggml_cuda_op_mul_mat(

  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  for (int64_t id = 0; id < g_device_count; ++id) {
+ if (row_low[id] == row_high[id]) {
+ continue;
+ }
  for (int64_t is = 0; is < is_max; ++is) {
  CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
  }
@@ -7147,6 +7169,8 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
  }

  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+ if (!g_cublas_loaded) return false;
+
  const int64_t ne10 = src1->ne[0];

  const int64_t ne0 = dst->ne[0];
@@ -7225,7 +7249,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor

  __global__ void k_compute_batched_ptrs(
  const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
- void ** ptrs,
+ const void ** ptrs_src, void ** ptrs_dst,
  int ne12, int ne13,
  int ne23,
  int nb02, int nb03,
@@ -7242,9 +7266,9 @@ __global__ void k_compute_batched_ptrs(
  int i03 = i13 / r3;
  int i02 = i12 / r2;

- ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*nb02 + i03*nb03;
- ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
- ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
+ ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
+ ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
  }

  static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -7350,14 +7374,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
  // use cublasGemmBatchedEx
  const int ne23 = ne12*ne13;

- void ** ptrs_as = nullptr;
- size_t ptrs_s = 0;
- ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+ const void ** ptrs_src = nullptr;
+ void ** ptrs_dst = nullptr;
+
+ size_t ptrs_src_s = 0;
+ size_t ptrs_dst_s = 0;
+
+ ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+ ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);

  dim3 block_dims(ne13, ne12);
  k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
  src0_as_f16, src1_as_f16, dst_f16,
- ptrs_as,
+ ptrs_src, ptrs_dst,
  ne12, ne13,
  ne23,
  nb02, nb03,
@@ -7369,14 +7398,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
  CUBLAS_CHECK(
  cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
  ne01, ne11, ne10,
- &alpha_f16, (const void * const *) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
- (const void * const *) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
- &beta_f16, ( void ** ) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
+ &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+ (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+ &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
  ne23,
  CUBLAS_COMPUTE_16F,
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));

- ggml_cuda_pool_free(ptrs_as, ptrs_s);
+ if (ptrs_src_s != 0) {
+ ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+ }
+ if (ptrs_dst_s != 0) {
+ ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+ }
  }
  #endif

@@ -7389,10 +7423,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const

  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  const bool all_on_device =
- (src0->backend == GGML_BACKEND_GPU) &&
+ (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
  (src1->backend == GGML_BACKEND_GPU) &&
  ( dst->backend == GGML_BACKEND_GPU);

+ const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+
  int64_t min_compute_capability = INT_MAX;
  for (int64_t id = 0; id < g_device_count; ++id) {
  if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
@@ -7414,13 +7450,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
  //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
  //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

- if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+ if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
  // KQ single-batch
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
- } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+ } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
  // KQV single-batch
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
- } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+ } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
  // KQ + KQV multi-batch
  ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
  } else if (src0->type == GGML_TYPE_F32) {
@@ -7820,6 +7856,8 @@ void ggml_cuda_free_scratch() {
  }

  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+ if (!g_cublas_loaded) return false;
+
  ggml_cuda_func_t func;
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
  || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
@@ -17,7 +17,12 @@ extern "C" {

  #define GGML_CUDA_MAX_DEVICES 16

+ // Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
  GGML_API void ggml_init_cublas(void);
+
+ // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
+ GGML_API bool ggml_cublas_loaded(void);
+
  GGML_API void * ggml_cuda_host_malloc(size_t size);
  GGML_API void ggml_cuda_host_free(void * ptr);

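As the new header comments state, `ggml_init_cublas()` itself never reports failure, so callers use the newly exported `ggml_cublas_loaded()` to find out whether a CUDA device and cuBLAS handle are actually available. A hypothetical caller might look like this (not shipped with the gem; assumes `ggml-cuda.h` and its dependencies are on the include path and the file is built with CUDA support):

```cuda
// Hypothetical consumer of the new API: initialize, then query the outcome
// and fall back to CPU-only execution if the CUDA backend did not come up.
#include <cstdio>
#include "ggml-cuda.h"

int main() {
    ggml_init_cublas();           // "always succeeds", even with no GPU present
    if (!ggml_cublas_loaded()) {  // added in this release
        fprintf(stderr, "CUDA/cuBLAS not available, falling back to CPU\n");
        return 0;
    }
    printf("CUDA backend initialized\n");
    return 0;
}
```

This is the same flag consulted internally by `ggml_cuda_can_mul_mat` and `ggml_cuda_compute_forward` in the hunks above, which now bail out early instead of asserting when CUDA is unavailable.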
@@ -1017,7 +1017,7 @@ void ggml_metal_graph_compute(
  [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
  [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
  [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
- [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
+ [encoder setThreadgroupMemoryLength:MAX(16, nth/32*sizeof(float)) atIndex:0];

  [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
  } break;
@@ -1348,7 +1348,7 @@ void ggml_metal_graph_compute(
  [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
  [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
  [encoder setBytes:&eps length:sizeof( float) atIndex:4];
- [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+ [encoder setThreadgroupMemoryLength:MAX(16, nth*sizeof(float)) atIndex:0];

  const int64_t nrows = ggml_nrows(src0);

@@ -1403,7 +1403,8 @@ void ggml_metal_graph_compute(
  const int n_past = ((int32_t *) dst->op_params)[0];
  const int n_dims = ((int32_t *) dst->op_params)[1];
  const int mode = ((int32_t *) dst->op_params)[2];
- const int n_orig_ctx = ((int32_t *) dst->op_params)[3];
+ // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
+ const int n_orig_ctx = ((int32_t *) dst->op_params)[4];

  float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
  memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));