llama_cpp 0.9.1 → 0.9.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: dae7507ce41f18e3fd0fb2d7445275a387a3914068aa9eef922f260de699970a
-  data.tar.gz: d66cc2629aeca3285bc10988f8c410fb8cf5b7f1fe6f835b5dc60e9dcab4be9d
+  metadata.gz: 66c53ea31dd93cc684d6bbc5331bb7e9f12abe2a23e6e16b8f8a3407e62961a0
+  data.tar.gz: 723d4f1d879c314d1733c84411e39d470f619a22be6a17d589406e831d8ea97b
 SHA512:
-  metadata.gz: 3e3e92aa38413877620947ec7996494cd720a3c211fcdf1973ce0d7a9a7e8803e293e2ce2f601b11e35858c5b4ef6b00d716069e322ea8d6b4c93412990fd746
-  data.tar.gz: 20a1e9e0e5812da9b00787afbf0f3aa0b762c8168f54ce3b7f2f25ff5b61cca5b2e7ab5faa065fbc3e266468d1c5747b8e0779fc7e073cc66240d1f3085e71c7
+  metadata.gz: bee0ffe56796ec8bf6240178246c7c95c38ec7cec2bd29f61c1cd85e1230291751c13da850c330fca644089ee2ff524a767b132b5bc6658e95205114e7399ba4
+  data.tar.gz: 382d05658c0a0d8df1c03dcaf93c8861bff3326e1d1e0c0cb3b0638f38cc3de5d36990b1f4df6d0bf3ce19337e9507cd5a2d196d893d8baf56d9b38a49738bc2
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+## [[0.9.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.1...v0.9.2)] - 2023-11-11
+
+- Bump bundled llama.cpp from b1472 to b1500.
+
 ## [[0.9.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.0...v0.9.1)] - 2023-11-03
 
 - Bump bundled llama.cpp from b1429 to b1472
@@ -378,9 +378,13 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
-static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
+static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view, bool update_backend) {
     assert(view->view_src != NULL && view->view_src->data != NULL);
-    view->backend = view->view_src->backend;
+
+    if (update_backend) {
+        view->backend = view->view_src->backend;
+    }
+
     view->buffer  = view->view_src->buffer;
     view->data    = (char *)view->view_src->data + view->view_offs;
 
@@ -394,7 +398,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-            init_view(alloc, node);
+            init_view(alloc, node, true);
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
@@ -424,15 +428,14 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                                 AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
                                 node->view_src = view_src;
                                 view_src_hn->n_views += 1;
-                                init_view(alloc, node);
+                                init_view(alloc, node, false);
                                 return;
                             }
-                        }
-                        else {
+                        } else {
                             AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                             node->view_src = parent;
                             p_hn->n_views += 1;
-                            init_view(alloc, node);
+                            init_view(alloc, node, false);
                             return;
                         }
                     }
@@ -463,7 +466,7 @@ size_t ggml_allocr_alloc_graph_n(
                 hash_get(ht, view_src)->n_views += 1;
                 if (node->buffer == NULL && node->data != NULL) {
                     // view of a pre-allocated tensor, didn't call init_view() yet
-                    init_view(alloc, node);
+                    init_view(alloc, node, true);
                 }
             }
 
@@ -474,7 +477,7 @@ size_t ggml_allocr_alloc_graph_n(
                 }
                 hash_get(ht, parent)->n_children += 1;
                 if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
-                    init_view(alloc, parent);
+                    init_view(alloc, parent, true);
                 }
             }
         }
@@ -982,7 +982,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;
@@ -1086,7 +1086,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
 static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;
@@ -1190,7 +1190,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
 
 static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
     const int num_blocks_per_row = ncols / QK_K;
     const int ib0 = row*num_blocks_per_row;
@@ -1444,7 +1444,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;
@@ -4254,7 +4254,7 @@ template <bool need_check> static __global__ void
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
     if (row >= nrows) {
         return;
@@ -4294,7 +4294,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
     if (row >= nrows) {
         return;
@@ -4867,7 +4867,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
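The comment added above gives the rationale for this whole batch of launch changes: gridDim.y and gridDim.z are capped at 65535 blocks, while gridDim.x allows up to 2^31 - 1 (compute capability 3.0 and later), so a very tall matrix cannot be covered one row-block per y index. Below is a minimal, self-contained CUDA sketch of the same pattern; it is illustrative only (scale_rows and its parameters are made up for this note, not part of ggml).

```cuda
// Standalone sketch of the row-on-x launch pattern: block_nums puts the row
// blocks on the x axis and the kernel recovers the row from blockIdx.x, so
// row counts above 65535 (the y/z grid limit) still launch.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void scale_rows(const float * src, float * dst, const int ncols, const int nrows) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;   // previously blockIdx.y*blockDim.y + threadIdx.y
    if (row >= nrows) {
        return;
    }
    for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
        dst[row*ncols + col] = 2.0f*src[row*ncols + col];
    }
}

int main() {
    const int ncols = 32;
    const int nrows = 100000;                    // more rows than gridDim.y could address one block per row
    const size_t n  = (size_t) ncols*nrows;

    float * src = nullptr;
    float * dst = nullptr;
    cudaMallocManaged(&src, n*sizeof(float));
    cudaMallocManaged(&dst, n*sizeof(float));
    for (size_t i = 0; i < n; ++i) {
        src[i] = 1.0f;
    }

    const int rows_per_block = 1;
    const dim3 block_dims(32, rows_per_block, 1);
    const dim3 block_nums((nrows + rows_per_block - 1) / rows_per_block, 1, 1);  // rows on the x dimension
    scale_rows<<<block_nums, block_dims>>>(src, dst, ncols, nrows);
    cudaDeviceSynchronize();

    printf("dst[0] = %.1f, dst[last] = %.1f\n", dst[0], dst[n - 1]);  // expect 2.0 for both
    cudaFree(src);
    cudaFree(dst);
    return 0;
}
```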
@@ -4876,7 +4877,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4885,7 +4886,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4894,7 +4895,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4903,7 +4904,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4913,7 +4914,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4922,7 +4923,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4931,7 +4932,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4946,7 +4947,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4954,7 +4955,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4963,7 +4964,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4972,7 +4973,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4981,7 +4982,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4990,7 +4991,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4999,7 +5000,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5008,7 +5009,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5017,7 +5018,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5026,7 +5027,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5035,7 +5036,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5054,7 +5055,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -5789,6 +5790,11 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
     CUDA_CHECK(cudaFree(ptr));
 }
 
+static bool g_cublas_loaded = false;
+
+bool ggml_cublas_loaded(void) {
+    return g_cublas_loaded;
+}
 
 void ggml_init_cublas() {
     static bool initialized = false;
@@ -5802,7 +5808,12 @@ void ggml_init_cublas() {
         CUDA_CHECK(cudaDeviceSynchronize());
 #endif
 
-        CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
+        if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
+            initialized = true;
+            g_cublas_loaded = false;
+            return;
+        }
+
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
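The hunk above turns a hard CUDA_CHECK abort into a soft failure: if cudaGetDeviceCount() errors out (no driver or no device), initialization still completes and g_cublas_loaded stays false. A standalone sketch of that probe-and-fallback pattern, with illustrative names (init_gpu, g_gpu_available) rather than the ggml-cuda internals:

```cuda
// Standalone sketch: treat a failing cudaGetDeviceCount() as "no GPU" and keep
// running on the CPU path instead of aborting the whole process.
#include <cstdio>
#include <cuda_runtime.h>

static bool g_gpu_available = false;   // stands in for g_cublas_loaded

static void init_gpu(void) {
    static bool initialized = false;
    if (initialized) {
        return;
    }

    int device_count = 0;
    if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count == 0) {
        initialized = true;            // remember the probe result, but do not abort
        g_gpu_available = false;
        return;
    }

    initialized = true;
    g_gpu_available = true;
}

int main() {
    init_gpu();
    printf("GPU backend available: %s\n", g_gpu_available ? "yes" : "no");
    return 0;
}
```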
@@ -5850,6 +5861,7 @@ void ggml_init_cublas() {
         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
 
         initialized = true;
+        g_cublas_loaded = true;
     }
 }
 
@@ -6892,6 +6904,8 @@ static void ggml_cuda_op_mul_mat(
     int64_t row_low[GGML_CUDA_MAX_DEVICES];
     int64_t row_high[GGML_CUDA_MAX_DEVICES];
 
+    int used_devices = 0;
+
     for (int64_t id = 0; id < g_device_count; ++id) {
         // by default, use all rows
         row_low[id] = 0;
@@ -6919,6 +6933,8 @@ static void ggml_cuda_op_mul_mat(
             continue;
         }
 
+        used_devices++;
+
         const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
         const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
 
@@ -6957,12 +6973,12 @@ static void ggml_cuda_op_mul_mat(
 
     // if multiple devices are used they need to wait for the main device
     // here an event is recorded that signals that the main device has finished calculating the input data
-    if (split && g_device_count > 1) {
+    if (split && used_devices > 1) {
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
     }
 
-    const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
+    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
     for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
         const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
         const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
@@ -7078,6 +7094,9 @@ static void ggml_cuda_op_mul_mat(
     }
 
     for (int64_t id = 0; id < g_device_count; ++id) {
+        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+            continue;
+        }
         CUDA_CHECK(ggml_cuda_set_device(id));
 
         // free buffers again when done
@@ -7102,6 +7121,9 @@ static void ggml_cuda_op_mul_mat(
 
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         for (int64_t id = 0; id < g_device_count; ++id) {
+            if (row_low[id] == row_high[id]) {
+                continue;
+            }
             for (int64_t is = 0; is < is_max; ++is) {
                 CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
             }
@@ -7147,6 +7169,8 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    if (!g_cublas_loaded) return false;
+
     const int64_t ne10 = src1->ne[0];
 
     const int64_t ne0 = dst->ne[0];
@@ -7225,7 +7249,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
 
 __global__ void k_compute_batched_ptrs(
         const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
-        void ** ptrs,
+        const void ** ptrs_src, void ** ptrs_dst,
         int ne12, int ne13,
         int ne23,
         int nb02, int nb03,
@@ -7242,9 +7266,9 @@ __global__ void k_compute_batched_ptrs(
     int i03 = i13 / r3;
     int i02 = i12 / r2;
 
-    ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*nb02 + i03*nb03;
-    ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
 }
 
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -7350,14 +7374,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         // use cublasGemmBatchedEx
         const int ne23 = ne12*ne13;
 
-        void ** ptrs_as = nullptr;
-        size_t ptrs_s = 0;
-        ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+        const void ** ptrs_src = nullptr;
+              void ** ptrs_dst = nullptr;
+
+        size_t ptrs_src_s = 0;
+        size_t ptrs_dst_s = 0;
+
+        ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+        ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
 
         dim3 block_dims(ne13, ne12);
         k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
                 src0_as_f16, src1_as_f16, dst_f16,
-                ptrs_as,
+                ptrs_src, ptrs_dst,
                 ne12, ne13,
                 ne23,
                 nb02, nb03,
@@ -7369,14 +7398,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUBLAS_CHECK(
         cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                &alpha_f16, (const void * const *) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
-                            (const void * const *) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
-                &beta_f16,  (       void **       ) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
+                &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                            (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+                &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
                 ne23,
                 CUBLAS_COMPUTE_16F,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
-        ggml_cuda_pool_free(ptrs_as, ptrs_s);
+        if (ptrs_src_s != 0) {
+            ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+        }
+        if (ptrs_dst_s != 0) {
+            ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+        }
     }
 #endif
 
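Taken together, the two hunks above split the single ptrs_as block (3*ne23 non-const pointers) into a const source table and a separate destination table, which matches cublasGemmBatchedEx's const Aarray/Barray versus non-const Carray parameters, and each pool block is now freed only if it was actually allocated. Below is a self-contained sketch of the device-side pointer-table idea; it uses plain cudaMalloc instead of ggml's internal pool allocator, and all names are illustrative.

```cuda
// Standalone sketch of the pointer-table pattern behind k_compute_batched_ptrs:
// a kernel writes one device pointer per batch entry into a const source table
// and a separate non-const destination table; a batched GEMM such as
// cublasGemmBatchedEx then consumes those tables.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void fill_ptr_tables(const float * a, float * c,
                                const void ** ptrs_src, void ** ptrs_dst,
                                const size_t stride, const int batch) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i >= batch) {
        return;
    }
    ptrs_src[i] = (const char *) a + i*stride*sizeof(float);  // read-only slice for batch i
    ptrs_dst[i] = (      char *) c + i*stride*sizeof(float);  // writable slice for batch i
}

int main() {
    const int batch = 8;
    const size_t stride = 16;

    float * a = nullptr;
    float * c = nullptr;
    const void ** ptrs_src = nullptr;
    void       ** ptrs_dst = nullptr;
    cudaMalloc(&a, batch*stride*sizeof(float));
    cudaMalloc(&c, batch*stride*sizeof(float));
    cudaMalloc(&ptrs_src, batch*sizeof(void *));
    cudaMalloc(&ptrs_dst, batch*sizeof(void *));

    fill_ptr_tables<<<1, batch>>>(a, c, ptrs_src, ptrs_dst, stride, batch);
    cudaDeviceSynchronize();

    // Copy the source table back just to show the entries are evenly strided slices of `a`.
    const void * h_src[8];
    cudaMemcpy(h_src, ptrs_src, sizeof(h_src), cudaMemcpyDeviceToHost);
    printf("stride between batch entries: %zu bytes\n",
           (size_t) ((const char *) h_src[1] - (const char *) h_src[0]));  // expect 64

    cudaFree(a);
    cudaFree(c);
    cudaFree((void *) ptrs_src);
    cudaFree(ptrs_dst);
    return 0;
}
```

Keeping the source table typed as const void ** is what lets the new call site drop the `(const void * const *)` casts the old ptrs_as-based code needed.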
@@ -7389,10 +7423,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool all_on_device =
-        (src0->backend == GGML_BACKEND_GPU) &&
+        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
         (src1->backend == GGML_BACKEND_GPU) &&
         ( dst->backend == GGML_BACKEND_GPU);
 
+    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+
     int64_t min_compute_capability = INT_MAX;
     for (int64_t id = 0; id < g_device_count; ++id) {
         if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
@@ -7414,13 +7450,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+    } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+    } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
         // KQ + KQV multi-batch
         ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
@@ -7820,6 +7856,8 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+    if (!g_cublas_loaded) return false;
+
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
@@ -17,7 +17,12 @@ extern "C" {
 
 #define GGML_CUDA_MAX_DEVICES 16
 
+// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
 GGML_API void   ggml_init_cublas(void);
+
+// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
+GGML_API bool   ggml_cublas_loaded(void);
+
 GGML_API void * ggml_cuda_host_malloc(size_t size);
 GGML_API void   ggml_cuda_host_free(void * ptr);
 
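A hypothetical caller-side sketch of the two declarations above: initialize once with ggml_init_cublas(), then branch on ggml_cublas_loaded() before relying on GPU offload. The include path and the CPU-fallback message are assumptions for illustration, not part of the patch, and the program must be linked against the CUDA-enabled ggml build.

```cuda
// Hypothetical host-side usage of the new API: initialize once, then query
// whether CUDA/cuBLAS actually came up before dispatching work to the GPU.
#include <cstdio>
#include "ggml-cuda.h"   // assumed include path for the header patched above

int main() {
    ggml_init_cublas();                  // per the comment above: always "succeeds"
    if (ggml_cublas_loaded()) {
        printf("CUDA device(s) found, cuBLAS initialized; GPU offload is available\n");
    } else {
        printf("no usable CUDA device; falling back to CPU-only execution\n");
    }
    return 0;
}
```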
@@ -1017,7 +1017,7 @@ void ggml_metal_graph_compute(
                         [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                         [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                         [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                        [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
+                        [encoder setThreadgroupMemoryLength:MAX(16, nth/32*sizeof(float)) atIndex:0];
 
                         [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                     } break;
@@ -1348,7 +1348,7 @@ void ggml_metal_graph_compute(
                         [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                         [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
                         [encoder setBytes:&eps length:sizeof( float) atIndex:4];
-                        [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+                        [encoder setThreadgroupMemoryLength:MAX(16, nth*sizeof(float)) atIndex:0];
 
                         const int64_t nrows = ggml_nrows(src0);
 
@@ -1403,7 +1403,8 @@ void ggml_metal_graph_compute(
                         const int n_past     = ((int32_t *) dst->op_params)[0];
                         const int n_dims     = ((int32_t *) dst->op_params)[1];
                         const int mode       = ((int32_t *) dst->op_params)[2];
-                        const int n_orig_ctx = ((int32_t *) dst->op_params)[3];
+                        // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
+                        const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
 
                         float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
                         memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));