llama_cpp 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/extconf.rb +1 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +1034 -0
- data/ext/llama_cpp/src/ggml-opencl.h +8 -10
- data/ext/llama_cpp/src/ggml.c +36 -47
- data/ext/llama_cpp/src/ggml.h +1 -0
- data/ext/llama_cpp/src/llama.cpp +30 -2
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -3
- data/ext/llama_cpp/src/ggml-opencl.c +0 -474
@@ -1,23 +1,21 @@
|
|
1
1
|
#pragma once
|
2
2
|
|
3
|
+
#include "ggml.h"
|
4
|
+
|
3
5
|
#ifdef __cplusplus
|
4
6
|
extern "C" {
|
5
7
|
#endif
|
6
8
|
|
7
9
|
void ggml_cl_init(void);
|
8
10
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
};
|
11
|
+
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
12
|
+
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
13
|
+
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
13
14
|
|
14
|
-
|
15
|
-
|
16
|
-
GGML_BLAS_OP_T = 112,
|
17
|
-
GGML_BLAS_OP_C = 113,
|
18
|
-
};
|
15
|
+
void * ggml_cl_host_malloc(size_t size);
|
16
|
+
void ggml_cl_host_free(void * ptr);
|
19
17
|
|
20
|
-
void
|
18
|
+
void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
|
21
19
|
|
22
20
|
#ifdef __cplusplus
|
23
21
|
}
|
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -9431,7 +9431,7 @@ static void ggml_compute_forward_rms_norm_back(
|
|
9431
9431
|
|
9432
9432
|
// ggml_compute_forward_mul_mat
|
9433
9433
|
|
9434
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
9434
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9435
9435
|
// helper function to determine if it is better to use BLAS or not
|
9436
9436
|
// for large matrices, BLAS is faster
|
9437
9437
|
static bool ggml_compute_forward_mul_mat_use_blas(
|
@@ -9472,7 +9472,7 @@ static void ggml_compute_forward_mul_mat_f32(
|
|
9472
9472
|
const int64_t ne02 = src0->ne[2];
|
9473
9473
|
const int64_t ne03 = src0->ne[3];
|
9474
9474
|
|
9475
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
9475
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9476
9476
|
const int64_t ne10 = src1->ne[0];
|
9477
9477
|
#endif
|
9478
9478
|
const int64_t ne11 = src1->ne[1];
|
@@ -9536,9 +9536,16 @@ static void ggml_compute_forward_mul_mat_f32(
|
|
9536
9536
|
}
|
9537
9537
|
return;
|
9538
9538
|
}
|
9539
|
+
#elif defined(GGML_USE_CLBLAST)
|
9540
|
+
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
9541
|
+
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
9542
|
+
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
9543
|
+
}
|
9544
|
+
return;
|
9545
|
+
}
|
9539
9546
|
#endif
|
9540
9547
|
|
9541
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
9548
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9542
9549
|
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
9543
9550
|
if (params->ith != 0) {
|
9544
9551
|
return;
|
@@ -9558,21 +9565,11 @@ static void ggml_compute_forward_mul_mat_f32(
|
|
9558
9565
|
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
9559
9566
|
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
9560
9567
|
|
9561
|
-
#if defined(GGML_USE_CLBLAST)
|
9562
|
-
// zT = y * xT
|
9563
|
-
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
|
9564
|
-
ne11, ne01, ne10,
|
9565
|
-
1.0f, y, ne10,
|
9566
|
-
x, ne10,
|
9567
|
-
0.0f, d, ne01,
|
9568
|
-
GGML_TYPE_F32);
|
9569
|
-
#else
|
9570
9568
|
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
9571
9569
|
ne11, ne01, ne10,
|
9572
9570
|
1.0f, y, ne10,
|
9573
9571
|
x, ne00,
|
9574
9572
|
0.0f, d, ne01);
|
9575
|
-
#endif
|
9576
9573
|
}
|
9577
9574
|
}
|
9578
9575
|
//printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
@@ -9711,9 +9708,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
9711
9708
|
}
|
9712
9709
|
return;
|
9713
9710
|
}
|
9711
|
+
#elif defined(GGML_USE_CLBLAST)
|
9712
|
+
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
9713
|
+
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
9714
|
+
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
9715
|
+
}
|
9716
|
+
return;
|
9717
|
+
}
|
9714
9718
|
#endif
|
9715
9719
|
|
9716
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
9720
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9717
9721
|
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
9718
9722
|
GGML_ASSERT(nb10 == sizeof(float));
|
9719
9723
|
|
@@ -9743,20 +9747,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
9743
9747
|
assert(id*sizeof(float) <= params->wsize);
|
9744
9748
|
}
|
9745
9749
|
|
9746
|
-
#if defined(GGML_USE_CLBLAST)
|
9747
|
-
const float * x = wdata;
|
9748
|
-
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
9749
|
-
|
9750
|
-
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
9751
|
-
|
9752
|
-
// zT = y * xT
|
9753
|
-
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
|
9754
|
-
ne11, ne01, ne10,
|
9755
|
-
1.0f, y, ne10,
|
9756
|
-
x, ne10,
|
9757
|
-
0.0f, d, ne01,
|
9758
|
-
GGML_TYPE_F32);
|
9759
|
-
#else
|
9760
9750
|
const float * x = wdata;
|
9761
9751
|
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
9762
9752
|
|
@@ -9768,7 +9758,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
9768
9758
|
1.0f, y, ne10,
|
9769
9759
|
x, ne00,
|
9770
9760
|
0.0f, d, ne01);
|
9771
|
-
#endif
|
9772
9761
|
}
|
9773
9762
|
}
|
9774
9763
|
|
@@ -9931,9 +9920,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
9931
9920
|
}
|
9932
9921
|
return;
|
9933
9922
|
}
|
9923
|
+
#elif defined(GGML_USE_CLBLAST)
|
9924
|
+
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
9925
|
+
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
9926
|
+
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
9927
|
+
}
|
9928
|
+
return;
|
9929
|
+
}
|
9934
9930
|
#endif
|
9935
9931
|
|
9936
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
9932
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9937
9933
|
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
9938
9934
|
if (params->ith != 0) {
|
9939
9935
|
return;
|
@@ -9956,9 +9952,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
9956
9952
|
|
9957
9953
|
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
9958
9954
|
|
9959
|
-
#if defined(GGML_USE_CLBLAST)
|
9960
|
-
const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
|
9961
|
-
#else
|
9962
9955
|
{
|
9963
9956
|
size_t id = 0;
|
9964
9957
|
for (int64_t i01 = 0; i01 < ne01; ++i01) {
|
@@ -9970,23 +9963,12 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
9970
9963
|
}
|
9971
9964
|
|
9972
9965
|
const float * x = wdata;
|
9973
|
-
#endif
|
9974
9966
|
|
9975
|
-
#if defined(GGML_USE_CLBLAST)
|
9976
|
-
// zT = y * xT
|
9977
|
-
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
|
9978
|
-
ne11, ne01, ne10,
|
9979
|
-
1.0f, y, ne10,
|
9980
|
-
x, ne10,
|
9981
|
-
0.0f, d, ne01,
|
9982
|
-
type);
|
9983
|
-
#else
|
9984
9967
|
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
9985
9968
|
ne11, ne01, ne10,
|
9986
9969
|
1.0f, y, ne10,
|
9987
9970
|
x, ne00,
|
9988
9971
|
0.0f, d, ne01);
|
9989
|
-
#endif
|
9990
9972
|
}
|
9991
9973
|
}
|
9992
9974
|
|
@@ -14165,9 +14147,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
14165
14147
|
cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
|
14166
14148
|
}
|
14167
14149
|
else
|
14150
|
+
#elif defined(GGML_USE_CLBLAST)
|
14151
|
+
if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
|
14152
|
+
node->n_tasks = 1; // TODO: this actually is doing nothing
|
14153
|
+
// the threads are still spinning
|
14154
|
+
cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
|
14155
|
+
}
|
14156
|
+
else
|
14168
14157
|
#endif
|
14169
14158
|
if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
|
14170
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
14159
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
14171
14160
|
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
14172
14161
|
node->n_tasks = 1; // TODO: this actually is doing nothing
|
14173
14162
|
// the threads are still spinning
|
@@ -14181,13 +14170,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
14181
14170
|
#endif
|
14182
14171
|
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
|
14183
14172
|
cur = 0;
|
14184
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
14173
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
14185
14174
|
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
14186
14175
|
node->n_tasks = 1;
|
14187
14176
|
}
|
14188
14177
|
#endif
|
14189
14178
|
} else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
|
14190
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
14179
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
14191
14180
|
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
14192
14181
|
node->n_tasks = 1;
|
14193
14182
|
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
|
data/ext/llama_cpp/src/ggml.h
CHANGED
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -12,6 +12,8 @@
|
|
12
12
|
#include "ggml.h"
|
13
13
|
#ifdef GGML_USE_CUBLAS
|
14
14
|
#include "ggml-cuda.h"
|
15
|
+
#elif defined(GGML_USE_CLBLAST)
|
16
|
+
#include "ggml-opencl.h"
|
15
17
|
#endif
|
16
18
|
|
17
19
|
#include <array>
|
@@ -1092,7 +1094,7 @@ static void llama_model_load_internal(
|
|
1092
1094
|
fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
|
1093
1095
|
}
|
1094
1096
|
fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
|
1095
|
-
#else
|
1097
|
+
#elif !defined(GGML_USE_CLBLAST)
|
1096
1098
|
(void) n_gpu_layers;
|
1097
1099
|
#endif
|
1098
1100
|
}
|
@@ -1125,7 +1127,33 @@ static void llama_model_load_internal(
|
|
1125
1127
|
done_size += lt.size;
|
1126
1128
|
}
|
1127
1129
|
}
|
1128
|
-
#endif
|
1130
|
+
#elif defined(GGML_USE_CLBLAST)
|
1131
|
+
{
|
1132
|
+
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
1133
|
+
|
1134
|
+
fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
|
1135
|
+
|
1136
|
+
size_t vram_total = 0;
|
1137
|
+
|
1138
|
+
for (int i = 0; i < n_gpu; ++i) {
|
1139
|
+
const auto & layer = model.layers[i];
|
1140
|
+
|
1141
|
+
ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
|
1142
|
+
ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
|
1143
|
+
ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
|
1144
|
+
ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
|
1145
|
+
ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
|
1146
|
+
ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
|
1147
|
+
ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
|
1148
|
+
}
|
1149
|
+
if (n_gpu_layers > (int) hparams.n_layer) {
|
1150
|
+
fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
|
1151
|
+
ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
|
1152
|
+
}
|
1153
|
+
|
1154
|
+
fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
|
1155
|
+
}
|
1156
|
+
#endif
|
1129
1157
|
|
1130
1158
|
if (progress_callback) {
|
1131
1159
|
progress_callback(1.0f, progress_callback_user_data);
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.1.2'
|
6
|
+
VERSION = '0.1.3'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'master-
|
9
|
+
LLAMA_CPP_VERSION = 'master-66874d4'
|
10
10
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llama_cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-05-
|
11
|
+
date: 2023-05-26 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
14
14
|
email:
|
@@ -27,7 +27,7 @@ files:
|
|
27
27
|
- ext/llama_cpp/llama_cpp.h
|
28
28
|
- ext/llama_cpp/src/LICENSE
|
29
29
|
- ext/llama_cpp/src/ggml-cuda.h
|
30
|
-
- ext/llama_cpp/src/ggml-opencl.c
|
30
|
+
- ext/llama_cpp/src/ggml-opencl.cpp
|
31
31
|
- ext/llama_cpp/src/ggml-opencl.h
|
32
32
|
- ext/llama_cpp/src/ggml.c
|
33
33
|
- ext/llama_cpp/src/ggml.h
|