llama_cpp 0.9.1 → 0.9.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-alloc.c +12 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +83 -45
- data/ext/llama_cpp/src/ggml-cuda.h +5 -0
- data/ext/llama_cpp/src/ggml-metal.m +4 -3
- data/ext/llama_cpp/src/ggml.c +78 -252
- data/ext/llama_cpp/src/ggml.h +5 -0
- data/ext/llama_cpp/src/llama.cpp +113 -81
- data/ext/llama_cpp/src/llama.h +5 -5
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 66c53ea31dd93cc684d6bbc5331bb7e9f12abe2a23e6e16b8f8a3407e62961a0
+  data.tar.gz: 723d4f1d879c314d1733c84411e39d470f619a22be6a17d589406e831d8ea97b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bee0ffe56796ec8bf6240178246c7c95c38ec7cec2bd29f61c1cd85e1230291751c13da850c330fca644089ee2ff524a767b132b5bc6658e95205114e7399ba4
+  data.tar.gz: 382d05658c0a0d8df1c03dcaf93c8861bff3326e1d1e0c0cb3b0638f38cc3de5d36990b1f4df6d0bf3ce19337e9507cd5a2d196d893d8baf56d9b38a49738bc2
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
+## [[0.9.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.1...v0.9.2)] - 2023-11-11
+
+- Bump bundled llama.cpp from b1472 to b1500.
+
 ## [[0.9.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.0...v0.9.1)] - 2023-11-03
 
 - Bump bundled llama.cpp from b1429 to b1472
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -378,9 +378,13 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
-static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
+static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view, bool update_backend) {
     assert(view->view_src != NULL && view->view_src->data != NULL);
-    view->backend = view->view_src->backend;
+
+    if (update_backend) {
+        view->backend = view->view_src->backend;
+    }
+
     view->buffer  = view->view_src->buffer;
     view->data    = (char *)view->view_src->data + view->view_offs;
 
@@ -394,7 +398,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-            init_view(alloc, node);
+            init_view(alloc, node, true);
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
@@ -424,15 +428,14 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                     AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
                     node->view_src = view_src;
                     view_src_hn->n_views += 1;
-                    init_view(alloc, node);
+                    init_view(alloc, node, false);
                     return;
                 }
-            }
-            else {
+            } else {
                 AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                 node->view_src = parent;
                 p_hn->n_views += 1;
-                init_view(alloc, node);
+                init_view(alloc, node, false);
                 return;
             }
         }
@@ -463,7 +466,7 @@ size_t ggml_allocr_alloc_graph_n(
             hash_get(ht, view_src)->n_views += 1;
             if (node->buffer == NULL && node->data != NULL) {
                 // view of a pre-allocated tensor, didn't call init_view() yet
-                init_view(alloc, node);
+                init_view(alloc, node, true);
             }
         }
 
@@ -474,7 +477,7 @@ size_t ggml_allocr_alloc_graph_n(
             }
             hash_get(ht, parent)->n_children += 1;
             if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
-                init_view(alloc, parent);
+                init_view(alloc, parent, true);
             }
         }
     }
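Taken together, the ggml-alloc.c hunks above thread a new update_backend flag through init_view: it is true when a tensor is initialized as a genuine view (or as a view of a pre-allocated tensor), so the view inherits its backend from view_src, and false when an existing node merely reuses a parent's buffer in place, so the node keeps the backend it was already assigned. A minimal, hedged sketch of that rule using simplified stand-in types, not the real ggml structs:

    // Minimal sketch, not the library code: toy types that only show when the
    // new update_backend flag should be true vs. false.
    struct toy_tensor {
        int                backend;   // which device owns the data
        struct toy_tensor *view_src;  // tensor this one is a view of
        void              *data;
    };

    static void toy_init_view(struct toy_tensor *view, bool update_backend) {
        if (update_backend) {
            // fresh view: it lives wherever its source lives
            view->backend = view->view_src->backend;
        }
        // in-place reuse (update_backend == false): the node keeps the backend
        // that was already assigned to it; only the data pointer is redirected
        view->data = view->view_src->data;
    }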
data/ext/llama_cpp/src/ggml-cuda.cu
CHANGED
@@ -982,7 +982,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;
@@ -1086,7 +1086,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
 static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;
@@ -1190,7 +1190,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
 
 static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
     const int num_blocks_per_row = ncols / QK_K;
     const int ib0 = row*num_blocks_per_row;
@@ -1444,7 +1444,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;
@@ -4254,7 +4254,7 @@ template <bool need_check> static __global__ void
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
     if (row >= nrows) {
         return;
@@ -4294,7 +4294,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
     if (row >= nrows) {
         return;
@@ -4867,7 +4867,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
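The comment added above explains the motivation for swapping the grid layout in all of the launcher functions that follow: CUDA limits gridDim.y and gridDim.z to 65535 blocks, while gridDim.x may go up to 2^31 - 1, so very tall matrices could overflow the old (1, block_num_y, 1) layout. A hedged, self-contained sketch of the same idea, using a toy kernel and made-up sizes rather than the ggml code:

    #include <cstdio>

    // Toy kernel: one warp-row of threads per matrix row, rows indexed through
    // the x dimension of the grid exactly as in the patched launchers above.
    __global__ void row_kernel(float * out, int nrows) {
        const int row = blockIdx.x*blockDim.y + threadIdx.y; // was blockIdx.y before the fix
        if (row >= nrows) return;
        if (threadIdx.x == 0) out[row] = (float) row;
    }

    int main() {
        const int nrows          = 100000;  // > 65535: would not fit in gridDim.y or gridDim.z
        const int rows_per_block = 2;       // plays the role of GGML_CUDA_MMV_Y
        const int block_num      = (nrows + rows_per_block - 1) / rows_per_block;

        float * d_out = nullptr;
        cudaMalloc(&d_out, nrows*sizeof(float));

        const dim3 block_nums(block_num, 1, 1);      // rows on the x axis (limit ~2^31 - 1)
        const dim3 block_dims(32, rows_per_block, 1);
        row_kernel<<<block_nums, block_dims, 0, 0>>>(d_out, nrows);
        printf("launch status: %s\n", cudaGetErrorString(cudaGetLastError()));

        cudaFree(d_out);
        return 0;
    }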
@@ -4876,7 +4877,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4885,7 +4886,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4894,7 +4895,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4903,7 +4904,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4913,7 +4914,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4922,7 +4923,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4931,7 +4932,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4946,7 +4947,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4954,7 +4955,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4963,7 +4964,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4972,7 +4973,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4981,7 +4982,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4990,7 +4991,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4999,7 +5000,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5008,7 +5009,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5017,7 +5018,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5026,7 +5027,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5035,7 +5036,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5054,7 +5055,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -5789,6 +5790,11 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
     CUDA_CHECK(cudaFree(ptr));
 }
 
+static bool g_cublas_loaded = false;
+
+bool ggml_cublas_loaded(void) {
+    return g_cublas_loaded;
+}
 
 void ggml_init_cublas() {
     static bool initialized = false;
@@ -5802,7 +5808,12 @@ void ggml_init_cublas() {
         CUDA_CHECK(cudaDeviceSynchronize());
 #endif
 
-        CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
+        if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
+            initialized = true;
+            g_cublas_loaded = false;
+            return;
+        }
+
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
@@ -5850,6 +5861,7 @@ void ggml_init_cublas() {
         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
 
         initialized = true;
+        g_cublas_loaded = true;
     }
 }
 
@@ -6892,6 +6904,8 @@ static void ggml_cuda_op_mul_mat(
     int64_t row_low[GGML_CUDA_MAX_DEVICES];
     int64_t row_high[GGML_CUDA_MAX_DEVICES];
 
+    int used_devices = 0;
+
     for (int64_t id = 0; id < g_device_count; ++id) {
         // by default, use all rows
         row_low[id]  = 0;
@@ -6919,6 +6933,8 @@ static void ggml_cuda_op_mul_mat(
             continue;
         }
 
+        used_devices++;
+
         const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
         const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
 
@@ -6957,12 +6973,12 @@ static void ggml_cuda_op_mul_mat(
 
     // if multiple devices are used they need to wait for the main device
     // here an event is recorded that signals that the main device has finished calculating the input data
-    if (split && g_device_count > 1) {
+    if (split && used_devices > 1) {
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
     }
 
-    const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
+    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
     for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
         const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
         const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
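The hunk above keeps the existing multi-GPU protocol and only changes the condition from the total device count to the number of devices that actually received rows: the main device records an event once the shared input is ready, and other streams wait on such events before touching the data (the -7102 hunk below shows the same primitive in the opposite direction, with the main device waiting on each helper's completion event). A hedged, stand-alone sketch of the record/wait pattern with plain CUDA runtime calls; the kernel and stream names are illustrative placeholders, not ggml symbols:

    #include <cstdio>

    // Stand-alone sketch of the "one stream prepares input, another waits" pattern.
    __global__ void prepare(float * buf) { buf[threadIdx.x] = 1.0f; }
    __global__ void consume(float * buf) { buf[threadIdx.x] += 1.0f; }

    int main() {
        float * buf = nullptr;
        cudaMalloc(&buf, 32*sizeof(float));

        cudaStream_t main_stream, helper_stream;
        cudaStreamCreate(&main_stream);
        cudaStreamCreate(&helper_stream);

        cudaEvent_t input_ready;
        cudaEventCreateWithFlags(&input_ready, cudaEventDisableTiming);

        prepare<<<1, 32, 0, main_stream>>>(buf);             // producer fills the input
        cudaEventRecord(input_ready, main_stream);           // signal: input is ready

        cudaStreamWaitEvent(helper_stream, input_ready, 0);  // consumer waits for the signal
        consume<<<1, 32, 0, helper_stream>>>(buf);           // safe to read the input now

        cudaStreamSynchronize(helper_stream);
        printf("done\n");

        cudaEventDestroy(input_ready);
        cudaStreamDestroy(main_stream);
        cudaStreamDestroy(helper_stream);
        cudaFree(buf);
        return 0;
    }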
@@ -7078,6 +7094,9 @@ static void ggml_cuda_op_mul_mat(
     }
 
     for (int64_t id = 0; id < g_device_count; ++id) {
+        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+            continue;
+        }
         CUDA_CHECK(ggml_cuda_set_device(id));
 
         // free buffers again when done
@@ -7102,6 +7121,9 @@ static void ggml_cuda_op_mul_mat(
 
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         for (int64_t id = 0; id < g_device_count; ++id) {
+            if (row_low[id] == row_high[id]) {
+                continue;
+            }
             for (int64_t is = 0; is < is_max; ++is) {
                 CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
             }
@@ -7147,6 +7169,8 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    if (!g_cublas_loaded) return false;
+
     const int64_t ne10 = src1->ne[0];
 
     const int64_t ne0 = dst->ne[0];
@@ -7225,7 +7249,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
 
 __global__ void k_compute_batched_ptrs(
         const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
-        void **
+        const void ** ptrs_src, void ** ptrs_dst,
         int ne12, int ne13,
         int ne23,
         int nb02, int nb03,
@@ -7242,9 +7266,9 @@ __global__ void k_compute_batched_ptrs(
     int i03 = i13 / r3;
     int i02 = i12 / r2;
 
-
-
-
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02   + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)     dst_f16 + i12* nb2/2 + i13* nb3/2;
 }
 
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -7350,14 +7374,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     // use cublasGemmBatchedEx
     const int ne23 = ne12*ne13;
 
-    void **
-
-
+    const void ** ptrs_src = nullptr;
+          void ** ptrs_dst = nullptr;
+
+    size_t ptrs_src_s = 0;
+    size_t ptrs_dst_s = 0;
+
+    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+    ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
 
     dim3 block_dims(ne13, ne12);
     k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
         src0_as_f16, src1_as_f16, dst_f16,
-
+        ptrs_src, ptrs_dst,
         ne12, ne13,
         ne23,
         nb02, nb03,
@@ -7369,14 +7398,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     CUBLAS_CHECK(
     cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
             ne01, ne11, ne10,
-            &alpha_f16, (const void
-                        (const void
-            &beta_f16,  ( void **
+            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
             ne23,
             CUBLAS_COMPUTE_16F,
             CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
-
+    if (ptrs_src_s != 0) {
+        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+    }
+    if (ptrs_dst_s != 0) {
+        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+    }
 }
 #endif
 
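For readers following the batched-cuBLAS path: the pool-allocated arrays above pack one pointer per batch entry, with ptrs_src[0*ne23 .. 1*ne23) holding the src0 slices, ptrs_src[1*ne23 .. 2*ne23) the src1 slices, and ptrs_dst[0*ne23 .. 1*ne23) the outputs, each indexed by i12 + i13*ne12. A hedged host-side sketch of that same layout with toy sizes and byte strides (illustrative only, not the ggml code, which fills the arrays on the device via k_compute_batched_ptrs):

    #include <cstdio>
    #include <vector>

    // Toy illustration of the pointer-array layout consumed by the batched GEMM:
    // one pointer per (i12, i13) batch index, flattened as slot = i12 + i13*ne12.
    int main() {
        const int ne12 = 2, ne13 = 3;
        const int ne23 = ne12*ne13;                 // number of GEMMs in the batch

        std::vector<char> src0(1024), src1(2048), dst(4096);
        const size_t nb02 = 64,  nb03 = 128;        // made-up byte strides
        const size_t nb12 = 128, nb13 = 256;
        const size_t nb2  = 256, nb3  = 512;

        std::vector<const void *> ptrs_src(2*ne23);
        std::vector<void *>       ptrs_dst(1*ne23);

        for (int i13 = 0; i13 < ne13; ++i13) {
            for (int i12 = 0; i12 < ne12; ++i12) {
                const int slot = i12 + i13*ne12;
                ptrs_src[0*ne23 + slot] = src0.data() + i12*nb02 + i13*nb03;
                ptrs_src[1*ne23 + slot] = src1.data() + i12*nb12 + i13*nb13;
                ptrs_dst[0*ne23 + slot] = dst.data()  + i12*nb2  + i13*nb3;
            }
        }

        printf("prepared %d pointer triples for the batched GEMM\n", ne23);
        return 0;
    }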
@@ -7389,10 +7423,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool all_on_device =
-        (src0->backend == GGML_BACKEND_GPU) &&
+        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
         (src1->backend == GGML_BACKEND_GPU) &&
         ( dst->backend == GGML_BACKEND_GPU);
 
+    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+
     int64_t min_compute_capability = INT_MAX;
     for (int64_t id = 0; id < g_device_count; ++id) {
         if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
@@ -7414,13 +7450,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+    } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+    } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
         // KQ + KQV multi-batch
         ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
@@ -7820,6 +7856,8 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+    if (!g_cublas_loaded) return false;
+
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
data/ext/llama_cpp/src/ggml-cuda.h
CHANGED
@@ -17,7 +17,12 @@ extern "C" {
 
 #define GGML_CUDA_MAX_DEVICES       16
 
+// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
 GGML_API void   ggml_init_cublas(void);
+
+// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
+GGML_API bool   ggml_cublas_loaded(void);
+
 GGML_API void * ggml_cuda_host_malloc(size_t size);
 GGML_API void   ggml_cuda_host_free(void * ptr);
 
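The two declarations above define the graceful-degradation contract introduced in this bundled llama.cpp update: ggml_init_cublas() no longer fails hard when no CUDA device is present, and callers are expected to consult ggml_cublas_loaded() before taking GPU code paths. A hedged sketch of how a host program might use the pair; the run_on_gpu/run_on_cpu names are illustrative placeholders, not part of the library:

    #include <cstdio>
    #include "ggml-cuda.h"

    // Illustrative only: inside ggml itself the equivalent decision happens in
    // ggml_cuda_compute_forward / ggml_cuda_can_mul_mat, which now return false
    // early when cuBLAS did not load.
    static void run_on_gpu(void) { printf("using the CUDA backend\n"); }
    static void run_on_cpu(void) { printf("falling back to the CPU backend\n"); }

    int main() {
        ggml_init_cublas();          // always returns, even with no CUDA device

        if (ggml_cublas_loaded()) {  // true only if a device was found and cuBLAS initialized
            run_on_gpu();
        } else {
            run_on_cpu();
        }
        return 0;
    }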
data/ext/llama_cpp/src/ggml-metal.m
CHANGED
@@ -1017,7 +1017,7 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                 [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                 [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
+                [encoder setThreadgroupMemoryLength:MAX(16, nth/32*sizeof(float)) atIndex:0];
 
                 [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
             } break;
@@ -1348,7 +1348,7 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                 [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
                 [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
-                [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+                [encoder setThreadgroupMemoryLength:MAX(16, nth*sizeof(float)) atIndex:0];
 
                 const int64_t nrows = ggml_nrows(src0);
 
@@ -1403,7 +1403,8 @@ void ggml_metal_graph_compute(
                 const int n_past     = ((int32_t *) dst->op_params)[0];
                 const int n_dims     = ((int32_t *) dst->op_params)[1];
                 const int mode       = ((int32_t *) dst->op_params)[2];
-                const int n_orig_ctx = ((int32_t *) dst->op_params)[3];
+                // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
+                const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
 
                 float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
                 memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));