llama_cpp 0.9.1 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-alloc.c +12 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +83 -45
- data/ext/llama_cpp/src/ggml-cuda.h +5 -0
- data/ext/llama_cpp/src/ggml-metal.m +4 -3
- data/ext/llama_cpp/src/ggml.c +78 -252
- data/ext/llama_cpp/src/ggml.h +5 -0
- data/ext/llama_cpp/src/llama.cpp +113 -81
- data/ext/llama_cpp/src/llama.h +5 -5
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 66c53ea31dd93cc684d6bbc5331bb7e9f12abe2a23e6e16b8f8a3407e62961a0
+  data.tar.gz: 723d4f1d879c314d1733c84411e39d470f619a22be6a17d589406e831d8ea97b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bee0ffe56796ec8bf6240178246c7c95c38ec7cec2bd29f61c1cd85e1230291751c13da850c330fca644089ee2ff524a767b132b5bc6658e95205114e7399ba4
+  data.tar.gz: 382d05658c0a0d8df1c03dcaf93c8861bff3326e1d1e0c0cb3b0638f38cc3de5d36990b1f4df6d0bf3ce19337e9507cd5a2d196d893d8baf56d9b38a49738bc2
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
+## [[0.9.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.1...v0.9.2)] - 2023-11-11
+
+- Bump bundled llama.cpp from b1472 to b1500.
+
 ## [[0.9.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.0...v0.9.1)] - 2023-11-03
 
 - Bump bundled llama.cpp from b1429 to b1472
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -378,9 +378,13 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
-static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
+static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view, bool update_backend) {
     assert(view->view_src != NULL && view->view_src->data != NULL);
-
+
+    if (update_backend) {
+        view->backend = view->view_src->backend;
+    }
+
     view->buffer = view->view_src->buffer;
     view->data   = (char *)view->view_src->data + view->view_offs;
 
@@ -394,7 +398,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-            init_view(alloc, node);
+            init_view(alloc, node, true);
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
@@ -424,15 +428,14 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                     AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
                     node->view_src = view_src;
                     view_src_hn->n_views += 1;
-                    init_view(alloc, node);
+                    init_view(alloc, node, false);
                     return;
                 }
-            }
-            else {
+            } else {
                 AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                 node->view_src = parent;
                 p_hn->n_views += 1;
-                init_view(alloc, node);
+                init_view(alloc, node, false);
                 return;
             }
         }
@@ -463,7 +466,7 @@ size_t ggml_allocr_alloc_graph_n(
             hash_get(ht, view_src)->n_views += 1;
             if (node->buffer == NULL && node->data != NULL) {
                 // view of a pre-allocated tensor, didn't call init_view() yet
-                init_view(alloc, node);
+                init_view(alloc, node, true);
             }
         }
 
@@ -474,7 +477,7 @@ size_t ggml_allocr_alloc_graph_n(
             }
             hash_get(ht, parent)->n_children += 1;
             if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
-                init_view(alloc, parent);
+                init_view(alloc, parent, true);
             }
         }
     }
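The ggml-alloc.c change above threads an update_backend flag through init_view(): a node that becomes a genuine view inherits its view source's backend, while a node that merely reuses a parent's buffer for an in-place op keeps the backend it already has. A reduced sketch of that pattern, using hypothetical stand-in types rather than ggml's real structs:

```cuda
#include <cassert>
#include <cstddef>

// Hypothetical stand-ins for ggml's tensor bookkeeping, for illustration only.
struct toy_tensor {
    int          backend;   // which device owns the data
    toy_tensor * view_src;  // tensor this one is a view of, if any
    void       * data;
    size_t       view_offs; // byte offset into view_src->data
};

// Mirrors the reworked helper: the backend is only copied from the view source
// when the caller is initializing a fresh view (update_backend == true), not
// when an existing node is re-pointed at a parent's buffer for an in-place op.
static void toy_init_view(toy_tensor * view, bool update_backend) {
    assert(view->view_src != nullptr && view->view_src->data != nullptr);

    if (update_backend) {
        view->backend = view->view_src->backend;
    }
    view->data = (char *) view->view_src->data + view->view_offs;
}
```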
data/ext/llama_cpp/src/ggml-cuda.cu
CHANGED
@@ -982,7 +982,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
-    const int row = blockIdx.
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;
@@ -1086,7 +1086,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
 static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
-    const int row = blockIdx.
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;
@@ -1190,7 +1190,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
 
 static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
-    const int row = blockIdx.
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
     const int num_blocks_per_row = ncols / QK_K;
     const int ib0 = row*num_blocks_per_row;
@@ -1444,7 +1444,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
-    const int row = blockIdx.
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;
@@ -4254,7 +4254,7 @@ template <bool need_check> static __global__ void
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
-    const int row = blockIdx.
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
     if (row >= nrows) {
         return;
@@ -4294,7 +4294,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
-    const int row = blockIdx.
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
     if (row >= nrows) {
         return;
@@ -4867,7 +4867,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-
+    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
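The recurring kernel change in the hunks above computes the row index from the grid's x dimension instead of y: CUDA limits gridDim.y and gridDim.z to 65535 blocks, while gridDim.x can go up to 2^31 - 1, so models with very tall weight matrices no longer overflow the launch configuration. A minimal, self-contained sketch of the same indexing and launch shape (illustrative only, not code from the gem):

```cuda
#include <cuda_runtime.h>

// Toy kernel: one row of work per (blockIdx.x, threadIdx.y) pair, the same
// indexing scheme the dequantize/mul_mat_vec kernels above switch to.
__global__ void scale_rows(const float * src, float * dst, int ncols, int nrows) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y; // rows along grid x
    if (row >= nrows) {
        return;
    }
    for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
        dst[row*ncols + col] = 2.0f*src[row*ncols + col];
    }
}

// Launch wrapper in the style of the *_cuda helpers: pack a few rows per block
// via blockDim.y and put the block count in the x dimension of the grid.
static void scale_rows_cuda(const float * src, float * dst, int ncols, int nrows, cudaStream_t stream) {
    const int rows_per_block = 2;                             // plays the role of GGML_CUDA_MMV_Y
    const int block_num_x    = (nrows + rows_per_block - 1) / rows_per_block;
    const dim3 block_nums(block_num_x, 1, 1);                 // x, not y/z, so nrows can be very large
    const dim3 block_dims(32, rows_per_block, 1);             // one warp per row
    scale_rows<<<block_nums, block_dims, 0, stream>>>(src, dst, ncols, nrows);
}
```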
@@ -4876,7 +4877,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4885,7 +4886,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4894,7 +4895,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4903,7 +4904,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4913,7 +4914,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4922,7 +4923,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4931,7 +4932,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4946,7 +4947,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4954,7 +4955,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4963,7 +4964,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4972,7 +4973,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4981,7 +4982,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4990,7 +4991,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4999,7 +5000,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5008,7 +5009,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5017,7 +5018,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5026,7 +5027,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5035,7 +5036,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5054,7 +5055,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -5789,6 +5790,11 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
     CUDA_CHECK(cudaFree(ptr));
 }
 
+static bool g_cublas_loaded = false;
+
+bool ggml_cublas_loaded(void) {
+    return g_cublas_loaded;
+}
 
 void ggml_init_cublas() {
     static bool initialized = false;
@@ -5802,7 +5808,12 @@ void ggml_init_cublas() {
         CUDA_CHECK(cudaDeviceSynchronize());
 #endif
 
-
+        if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
+            initialized = true;
+            g_cublas_loaded = false;
+            return;
+        }
+
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
@@ -5850,6 +5861,7 @@ void ggml_init_cublas() {
         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
 
         initialized = true;
+        g_cublas_loaded = true;
     }
 }
 
@@ -6892,6 +6904,8 @@ static void ggml_cuda_op_mul_mat(
     int64_t row_low[GGML_CUDA_MAX_DEVICES];
     int64_t row_high[GGML_CUDA_MAX_DEVICES];
 
+    int used_devices = 0;
+
     for (int64_t id = 0; id < g_device_count; ++id) {
         // by default, use all rows
         row_low[id] = 0;
@@ -6919,6 +6933,8 @@ static void ggml_cuda_op_mul_mat(
             continue;
         }
 
+        used_devices++;
+
         const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
         const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
 
@@ -6957,12 +6973,12 @@ static void ggml_cuda_op_mul_mat(
 
     // if multiple devices are used they need to wait for the main device
    // here an event is recorded that signals that the main device has finished calculating the input data
-    if (split &&
+    if (split && used_devices > 1) {
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
     }
 
-    const int64_t src1_col_stride = split &&
+    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
     for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
         const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
         const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
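The used_devices bookkeeping above exists so the cross-device synchronization is only set up when more than one GPU actually receives a slice of rows. The underlying CUDA pattern is plain event/stream ordering: record an event on the stream that produces the shared input, then make every other stream wait on it. A hedged sketch with made-up names, not the gem's code:

```cuda
#include <cuda_runtime.h>

// Enqueue ordering only: the host does not block here. Secondary streams will
// not start their queued work until main_stream has reached the recorded point.
static void wait_for_main_device(cudaStream_t main_stream,
                                 cudaStream_t * secondary_streams, int n_secondary,
                                 cudaEvent_t input_ready) {
    // mark the point in main_stream after which the shared inputs are valid
    cudaEventRecord(input_ready, main_stream);

    // every other stream waits on that event before running its own kernels
    for (int i = 0; i < n_secondary; ++i) {
        cudaStreamWaitEvent(secondary_streams[i], input_ready, 0);
    }
}
```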
@@ -7078,6 +7094,9 @@ static void ggml_cuda_op_mul_mat(
     }
 
     for (int64_t id = 0; id < g_device_count; ++id) {
+        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+            continue;
+        }
         CUDA_CHECK(ggml_cuda_set_device(id));
 
         // free buffers again when done
@@ -7102,6 +7121,9 @@ static void ggml_cuda_op_mul_mat(
 
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         for (int64_t id = 0; id < g_device_count; ++id) {
+            if (row_low[id] == row_high[id]) {
+                continue;
+            }
             for (int64_t is = 0; is < is_max; ++is) {
                 CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
             }
@@ -7147,6 +7169,8 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    if (!g_cublas_loaded) return false;
+
     const int64_t ne10 = src1->ne[0];
 
     const int64_t ne0 = dst->ne[0];
@@ -7225,7 +7249,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
 
 __global__ void k_compute_batched_ptrs(
         const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
-        void **
+        const void ** ptrs_src, void ** ptrs_dst,
         int ne12, int ne13,
         int ne23,
         int nb02, int nb03,
@@ -7242,9 +7266,9 @@ __global__ void k_compute_batched_ptrs(
     int i03 = i13 / r3;
     int i02 = i12 / r2;
 
-
-
-
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)     dst_f16 + i12* nb2/2 + i13* nb3/2;
 }
 
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -7350,14 +7374,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     // use cublasGemmBatchedEx
     const int ne23 = ne12*ne13;
 
-    void **
-
-
+    const void ** ptrs_src = nullptr;
+          void ** ptrs_dst = nullptr;
+
+    size_t ptrs_src_s = 0;
+    size_t ptrs_dst_s = 0;
+
+    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+    ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
 
     dim3 block_dims(ne13, ne12);
     k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
         src0_as_f16, src1_as_f16, dst_f16,
-
+        ptrs_src, ptrs_dst,
         ne12, ne13,
         ne23,
         nb02, nb03,
@@ -7369,14 +7398,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     CUBLAS_CHECK(
     cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
             ne01, ne11, ne10,
-            &alpha_f16, (const void
-                        (const void
-            &beta_f16,  ( void **
+            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
            ne23,
            CUBLAS_COMPUTE_16F,
            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
-
+    if (ptrs_src_s != 0) {
+        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+    }
+    if (ptrs_dst_s != 0) {
+        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+    }
 }
 #endif
 
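cublasGemmBatchedEx consumes arrays of per-matrix device pointers, which is why the hunks above allocate ptrs_src/ptrs_dst from the pool and fill them with a tiny kernel before the GEMM call. A stripped-down sketch of that pointer-table step, with illustrative names and plain byte strides rather than the gem's ne/nb bookkeeping:

```cuda
#include <cuda_fp16.h>

// Fill per-batch pointer tables on the device, one entry per (i12, i13) pair.
// Layout follows the idea above: A pointers in ptrs_src[0..ne23), B pointers in
// ptrs_src[ne23..2*ne23), C pointers in ptrs_dst[0..ne23). Strides are in bytes.
__global__ void fill_batched_ptrs(
        const half * a, const half * b, half * c,
        const void ** ptrs_src, void ** ptrs_dst,
        int ne12, int ne23,
        size_t stride_a, size_t stride_b, size_t stride_c) {
    const int i12 = threadIdx.x; // inner batch index (launched with blockDim.x == ne12)
    const int i13 = blockIdx.x;  // outer batch index
    const int idx = i12 + i13*ne12;
    if (idx >= ne23) {
        return;
    }
    ptrs_src[0*ne23 + idx] = (const char *) a + idx*stride_a;
    ptrs_src[1*ne23 + idx] = (const char *) b + idx*stride_b;
    ptrs_dst[0*ne23 + idx] = (      char *) c + idx*stride_c;
}
```

The resulting tables are what cublasGemmBatchedEx then takes as its Aarray/Barray/Carray arguments in the hunk above.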
@@ -7389,10 +7423,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool all_on_device =
-        (src0->backend == GGML_BACKEND_GPU) &&
+        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
         (src1->backend == GGML_BACKEND_GPU) &&
         ( dst->backend == GGML_BACKEND_GPU);
 
+    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+
     int64_t min_compute_capability = INT_MAX;
     for (int64_t id = 0; id < g_device_count; ++id) {
         if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
@@ -7414,13 +7450,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+    } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+    } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
         // KQ + KQV multi-batch
         ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
@@ -7820,6 +7856,8 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+    if (!g_cublas_loaded) return false;
+
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
data/ext/llama_cpp/src/ggml-cuda.h
CHANGED
@@ -17,7 +17,12 @@ extern "C" {
 
 #define GGML_CUDA_MAX_DEVICES 16
 
+// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
 GGML_API void   ggml_init_cublas(void);
+
+// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
+GGML_API bool   ggml_cublas_loaded(void);
+
 GGML_API void * ggml_cuda_host_malloc(size_t size);
 GGML_API void   ggml_cuda_host_free(void * ptr);
 
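The two new declarations in ggml-cuda.h split initialization from capability probing: ggml_init_cublas() no longer fails outright when no device is present, and ggml_cublas_loaded() reports whether the CUDA/cuBLAS backend actually came up. A hedged usage sketch; the surrounding program and include path are hypothetical, only the two functions come from the header above:

```cuda
#include <cstdio>

#include "ggml-cuda.h"

int main() {
    // always "succeeds"; on machines without a usable CUDA setup it just
    // records internally that cuBLAS could not be loaded
    ggml_init_cublas();

    if (ggml_cublas_loaded()) {
        std::printf("CUDA device(s) found, GPU offload is available\n");
    } else {
        // the diff above also makes ggml_cuda_can_mul_mat() and
        // ggml_cuda_compute_forward() bail out early in this state,
        // so work falls back to the CPU path
        std::printf("no usable CUDA device, running on CPU\n");
    }
    return 0;
}
```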
data/ext/llama_cpp/src/ggml-metal.m
CHANGED
@@ -1017,7 +1017,7 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                 [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                 [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
+                [encoder setThreadgroupMemoryLength:MAX(16, nth/32*sizeof(float)) atIndex:0];
 
                 [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
             } break;
@@ -1348,7 +1348,7 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                 [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
                 [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
-                [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+                [encoder setThreadgroupMemoryLength:MAX(16, nth*sizeof(float)) atIndex:0];
 
                 const int64_t nrows = ggml_nrows(src0);
 
@@ -1403,7 +1403,8 @@ void ggml_metal_graph_compute(
                 const int n_past = ((int32_t *) dst->op_params)[0];
                 const int n_dims = ((int32_t *) dst->op_params)[1];
                 const int mode   = ((int32_t *) dst->op_params)[2];
-
+                // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
+                const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
 
                 float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
                 memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));