llama_cpp 0.1.2 → 0.1.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/extconf.rb +1 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +1034 -0
- data/ext/llama_cpp/src/ggml-opencl.h +8 -10
- data/ext/llama_cpp/src/ggml.c +36 -47
- data/ext/llama_cpp/src/ggml.h +1 -0
- data/ext/llama_cpp/src/llama.cpp +30 -2
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -3
- data/ext/llama_cpp/src/ggml-opencl.c +0 -474
data/ext/llama_cpp/src/ggml-opencl.h
CHANGED
@@ -1,23 +1,21 @@
 #pragma once
 
+#include "ggml.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 void ggml_cl_init(void);
 
-enum ggml_blas_order {
-    GGML_BLAS_ORDER_ROW_MAJOR = 101,
-    GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
-};
+bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
 
-enum ggml_blas_op {
-    GGML_BLAS_OP_N = 111,
-    GGML_BLAS_OP_T = 112,
-    GGML_BLAS_OP_C = 113,
-};
+void * ggml_cl_host_malloc(size_t size);
+void ggml_cl_host_free(void * ptr);
 
-void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void * host_a, const int lda, const float * host_b, const int ldb, const float beta, float * host_c, const int ldc, const int btype);
+void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
 
 #ifdef __cplusplus
 }
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -9431,7 +9431,7 @@ static void ggml_compute_forward_rms_norm_back(
 
 // ggml_compute_forward_mul_mat
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
 static bool ggml_compute_forward_mul_mat_use_blas(
@@ -9472,7 +9472,7 @@ static void ggml_compute_forward_mul_mat_f32(
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     const int64_t ne10 = src1->ne[0];
 #endif
     const int64_t ne11 = src1->ne[1];
@@ -9536,9 +9536,16 @@ static void ggml_compute_forward_mul_mat_f32(
         }
         return;
     }
+#elif defined(GGML_USE_CLBLAST)
+    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
 #endif
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -9558,21 +9565,11 @@ static void ggml_compute_forward_mul_mat_f32(
             const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
-#if defined(GGML_USE_CLBLAST)
-            // zT = y * xT
-            ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
-                    ne11, ne01, ne10,
-                    1.0f, y, ne10,
-                    x, ne10,
-                    0.0f, d, ne01,
-                    GGML_TYPE_F32);
-#else
             cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                     ne11, ne01, ne10,
                     1.0f, y, ne10,
                     x, ne00,
                     0.0f, d, ne01);
-#endif
         }
     }
     //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
@@ -9711,9 +9708,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
         }
         return;
     }
+#elif defined(GGML_USE_CLBLAST)
+    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
 #endif
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_ASSERT(nb10 == sizeof(float));
 
@@ -9743,20 +9747,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                 assert(id*sizeof(float) <= params->wsize);
             }
 
-#if defined(GGML_USE_CLBLAST)
-            const float * x = wdata;
-            const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-
-            // zT = y * xT
-            ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
-                    ne11, ne01, ne10,
-                    1.0f, y, ne10,
-                    x, ne10,
-                    0.0f, d, ne01,
-                    GGML_TYPE_F32);
-#else
             const float * x = wdata;
             const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
 
@@ -9768,7 +9758,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                     1.0f, y, ne10,
                     x, ne00,
                     0.0f, d, ne01);
-#endif
         }
     }
 
@@ -9931,9 +9920,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
         }
         return;
     }
+#elif defined(GGML_USE_CLBLAST)
+    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
 #endif
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -9956,9 +9952,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
-#if defined(GGML_USE_CLBLAST)
-            const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
-#else
             {
                 size_t id = 0;
                 for (int64_t i01 = 0; i01 < ne01; ++i01) {
@@ -9970,23 +9963,12 @@ static void ggml_compute_forward_mul_mat_q_f32(
                 }
 
                 const float * x = wdata;
-#endif
 
-#if defined(GGML_USE_CLBLAST)
-            // zT = y * xT
-            ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
-                    ne11, ne01, ne10,
-                    1.0f, y, ne10,
-                    x, ne10,
-                    0.0f, d, ne01,
-                    type);
-#else
             cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                     ne11, ne01, ne10,
                     1.0f, y, ne10,
                     x, ne00,
                     0.0f, d, ne01);
-#endif
         }
     }
 
@@ -14165,9 +14147,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
                     }
                     else
+#elif defined(GGML_USE_CLBLAST)
+                    if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
+                        node->n_tasks = 1; // TODO: this actually is doing nothing
+                                           // the threads are still spinning
+                        cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
+                    }
+                    else
 #endif
                     if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                         if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                             node->n_tasks = 1; // TODO: this actually is doing nothing
                                                // the threads are still spinning
@@ -14181,13 +14170,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 #endif
                     } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                         cur = 0;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                         if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                             node->n_tasks = 1;
                         }
 #endif
                     } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                         if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                             node->n_tasks = 1;
                             cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
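Taken together, the ggml.c hunks repeat one pattern across the f32, f16-f32 and quantized mul_mat kernels: during graph planning the per-node work buffer is sized with ggml_cl_mul_mat_get_wsize, and at compute time only thread 0 hands the whole product to ggml_cl_mul_mat while the other threads simply return. A small sketch of that planning decision follows, under the assumption of a CLBLAST build; plan_mul_mat_wsize is a hypothetical helper, not a function in ggml.c, and the CPU fallback estimate is the one used for the quantized case above.

/* Sketch only: mirrors the work-size planning these hunks add to ggml_graph_compute().
 * plan_mul_mat_wsize is a hypothetical name; node->src0/src1 follow the hunks above. */
#include <stddef.h>
#include "ggml.h"
#include "ggml-opencl.h"

static size_t plan_mul_mat_wsize(struct ggml_tensor * node) {
    if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
        /* the OpenCL backend reports how much host scratch it needs for this product */
        return ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
    }
    /* CPU fallback: room for dequantized rows of src0, as in the quantized branch above */
    return ggml_type_size(GGML_TYPE_F32) * (size_t)(node->src0->ne[0] * node->src0->ne[1]);
}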
data/ext/llama_cpp/src/ggml.h
CHANGED
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -12,6 +12,8 @@
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
+#elif defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
 #endif
 
 #include <array>
@@ -1092,7 +1094,7 @@ static void llama_model_load_internal(
             fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
         }
         fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-#else
+#elif !defined(GGML_USE_CLBLAST)
     (void) n_gpu_layers;
 #endif
 }
@@ -1125,7 +1127,33 @@ static void llama_model_load_internal(
             done_size += lt.size;
         }
     }
-#endif
+#elif defined(GGML_USE_CLBLAST)
+    {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
+
+        size_t vram_total = 0;
+
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+
+            ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+            ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+            ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+            ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+            ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+            ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+            ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        }
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
+            ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        }
+
+        fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
+    }
+#endif
 
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.1.2'
+  VERSION = '0.1.3'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-66874d4'
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-05-
+date: 2023-05-26 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -27,7 +27,7 @@ files:
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
 - ext/llama_cpp/src/ggml-cuda.h
-- ext/llama_cpp/src/ggml-opencl.c
+- ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
|