llama_cpp 0.1.2 → 0.1.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -7
- data/ext/llama_cpp/extconf.rb +1 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +1028 -0
- data/ext/llama_cpp/src/ggml-opencl.h +8 -10
- data/ext/llama_cpp/src/ggml.c +568 -57
- data/ext/llama_cpp/src/ggml.h +21 -2
- data/ext/llama_cpp/src/llama.cpp +37 -2
- data/ext/llama_cpp/src/llama.h +5 -0
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -3
- data/ext/llama_cpp/src/ggml-opencl.c +0 -474
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -198,6 +198,7 @@
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_OPT           4
+#define GGML_MAX_NAME          32
 #define GGML_DEFAULT_N_THREADS 4
 
 #define GGML_ASSERT(x) \
@@ -249,6 +250,7 @@ extern "C" {
     enum ggml_backend {
         GGML_BACKEND_CPU = 0,
         GGML_BACKEND_CUDA = 1,
+        GGML_BACKEND_CL = 2,
     };
 
     // model file types
@@ -371,11 +373,13 @@ extern "C" {
 
         void * data;
 
-        char name[32];
+        char name[GGML_MAX_NAME];
 
         char padding[16];
     };
 
+    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
@@ -428,6 +432,7 @@ extern "C" {
     GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 
     GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_op_name  (enum ggml_op   op);
 
     GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
 
@@ -436,6 +441,9 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -443,7 +451,11 @@ extern "C" {
 
     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
 
-    GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API void   ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+
+    GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
+    GGML_API size_t ggml_get_mem_size  (struct ggml_context * ctx);
 
     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,
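Taken together, the additions above let a caller measure a model graph before committing real memory: create a context with no_alloc enabled, build the tensors, then read back the bookkeeping numbers. A minimal C++ sketch of that pattern (not from the gem; the buffer sizes are illustrative, and ggml_new_tensor_1d/ggml_add/ggml_used_mem are pre-existing ggml APIs):

    #include <cstdio>
    #include "ggml.h"

    int main() {
        // Metadata-only pass: with no_alloc set, tensor data is never
        // allocated, so the context only consumes per-tensor overhead.
        struct ggml_init_params params = {
            /*.mem_size   =*/ 1 * 1024 * 1024, // illustrative
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        struct ggml_tensor * c = ggml_add(ctx, a, b); // graph node, no data
        (void) c;

        printf("per-tensor overhead: %zu bytes\n", ggml_tensor_overhead());
        printf("metadata used: %zu of %zu bytes\n",
               ggml_used_mem(ctx), ggml_get_mem_size(ctx));

        ggml_free(ctx);
        return 0;
    }

A second context, sized from these numbers plus the actual tensor data, can then be created with no_alloc off for the real run.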
@@ -483,6 +495,8 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
 
+    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
     GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
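ggml_get_tensor() complements the name field added to ggml_tensor above: any tensor that has been given a name can be found again through its context. A short sketch, assuming the pre-existing ggml_set_name() helper (names longer than GGML_MAX_NAME are cut off):

    #include <cstdio>
    #include "ggml.h"

    int main() {
        struct ggml_init_params params = { 16 * 1024 * 1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
        ggml_set_name(w, "layers.0.weight"); // illustrative name

        // ... later, without holding on to the original pointer ...
        struct ggml_tensor * found = ggml_get_tensor(ctx, "layers.0.weight");
        printf("found == w: %s\n", found == w ? "yes" : "no");

        ggml_free(ctx);
        return 0;
    }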
@@ -969,6 +983,11 @@ extern "C" {
     GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
     GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
 
+    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+
+    GGML_API void               ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
+    GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
+
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
 
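ggml_graph_export()/ggml_graph_import() round-trip a computation graph through a file, and ggml_graph_get_tensor() then locates named inputs and outputs in the imported copy. A minimal sketch, not from the gem (file and tensor names are illustrative; ggml_build_forward, ggml_sqr, ggml_set_f32, and ggml_get_f32_1d are pre-existing ggml APIs):

    #include <cstdio>
    #include "ggml.h"

    int main() {
        struct ggml_init_params params = { 16 * 1024 * 1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_set_name(x, "x");
        struct ggml_tensor * y = ggml_sqr(ctx, x);
        ggml_set_name(y, "y");

        struct ggml_cgraph gf = ggml_build_forward(y);
        ggml_graph_export(&gf, "graph.ggml");

        // The importer allocates ctx_data (tensor data) and ctx_eval (graph).
        struct ggml_context * ctx_data = NULL;
        struct ggml_context * ctx_eval = NULL;
        struct ggml_cgraph gf2 = ggml_graph_import("graph.ggml", &ctx_data, &ctx_eval);

        // A separate work context gives ggml_graph_compute() scratch space.
        struct ggml_init_params wparams = { 16 * 1024 * 1024, NULL, false };
        struct ggml_context * ctx_work = ggml_init(wparams);

        struct ggml_tensor * x2 = ggml_graph_get_tensor(&gf2, "x");
        ggml_set_f32(x2, 3.0f); // fill the input with 3.0
        ggml_graph_compute(ctx_work, &gf2);

        struct ggml_tensor * y2 = ggml_graph_get_tensor(&gf2, "y");
        printf("y[0] = %.1f\n", ggml_get_f32_1d(y2, 0)); // expect 9.0

        ggml_free(ctx_work);
        ggml_free(ctx_eval);
        ggml_free(ctx_data);
        ggml_free(ctx);
        return 0;
    }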
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -12,6 +12,8 @@
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
+#elif defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
 #endif
 
 #include <array>
@@ -40,6 +42,7 @@
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_3B,
     MODEL_7B,
     MODEL_13B,
     MODEL_30B,
@@ -56,6 +59,7 @@ static const size_t MB = 1024*1024;
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,    128ull * MB },
         { MODEL_7B,    512ull * MB },
         { MODEL_13B,   512ull * MB },
         { MODEL_30B,   512ull * MB },
@@ -67,6 +71,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,    128ull * MB },
         { MODEL_7B,    512ull * MB },
         { MODEL_13B,   512ull * MB },
         { MODEL_30B,   512ull * MB },
@@ -79,6 +84,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,    682ull * MB },
         { MODEL_7B,   1026ull * MB },
         { MODEL_13B,  1608ull * MB },
         { MODEL_30B,  3124ull * MB },
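The MEM_REQ_KV_SELF table budgets the self-attention KV cache. As a sanity check, the 7B entry is close to the exact cache size under the usual LLaMA-7B shapes (n_layer = 32, n_embd = 4096, n_ctx = 2048, f16 elements; these hyperparameters are assumed here, not stated in the diff):

    #include <cstdio>

    int main() {
        const unsigned long long n_layer = 32, n_embd = 4096, n_ctx = 2048;
        const unsigned long long f16_bytes = 2; // bytes per cached element
        // K and V caches each hold n_layer * n_ctx * n_embd elements.
        const unsigned long long kv_bytes = 2 * n_layer * n_ctx * n_embd * f16_bytes;
        printf("%llu MB\n", kv_bytes / (1024 * 1024)); // 1024 MB vs. 1026 MB budgeted
        return 0;
    }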
@@ -92,6 +98,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,   512ull * MB },
         { MODEL_7B,   768ull * MB },
         { MODEL_13B, 1024ull * MB },
         { MODEL_30B, 1280ull * MB },
@@ -897,6 +904,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
 
 static const char *llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
@@ -930,6 +938,7 @@ static void llama_model_load_internal(
 
     {
         switch (hparams.n_layer) {
+            case 26: model.type = e_model::MODEL_3B; break;
            case 32: model.type = e_model::MODEL_7B; break;
            case 40: model.type = e_model::MODEL_13B; break;
            case 60: model.type = e_model::MODEL_30B; break;
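The layer count is the only signal used to classify a checkpoint, and the new 26-layer case is what routes 3B models (upstream llama.cpp added it for OpenLLaMA 3B; that attribution is an inference, not stated in this diff). Extracted as a standalone sketch:

    #include <cstdint>

    enum e_model { MODEL_UNKNOWN, MODEL_3B, MODEL_7B, MODEL_13B, MODEL_30B, MODEL_65B };

    // Mirror of the switch above; the 65B case is not shown in this hunk
    // and is assumed to follow the same pattern in the surrounding code.
    static e_model model_type_from_n_layer(uint32_t n_layer) {
        switch (n_layer) {
            case 26: return MODEL_3B;
            case 32: return MODEL_7B;
            case 40: return MODEL_13B;
            case 60: return MODEL_30B;
            case 80: return MODEL_65B;
            default: return MODEL_UNKNOWN;
        }
    }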
@@ -1092,7 +1101,7 @@ static void llama_model_load_internal(
             fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
         }
         fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-#
+#elif !defined(GGML_USE_CLBLAST)
     (void) n_gpu_layers;
 #endif
 }
@@ -1125,7 +1134,33 @@ static void llama_model_load_internal(
             done_size += lt.size;
         }
     }
-#
+#elif defined(GGML_USE_CLBLAST)
+    {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
+
+        size_t vram_total = 0;
+
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+
+            ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+            ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+            ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+            ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+            ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+            ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+            ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        }
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
+            ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        }
+
+        fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
+    }
+#endif
 
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
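For library users, nothing new is required to reach the OpenCL path: as with cuBLAS, offloading is driven by the n_gpu_layers field of llama_context_params in llama.h. A usage sketch ("model.bin" is a placeholder path; the layer count is illustrative):

    #include <cstdio>
    #include "llama.h"

    int main() {
        llama_context_params params = llama_context_default_params();
        params.n_gpu_layers = 20; // layers transformed for the OpenCL backend

        llama_context * ctx = llama_init_from_file("model.bin", params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }
        // ... evaluate tokens as usual; offloaded layers run via CLBlast ...
        llama_free(ctx);
        return 0;
    }

Per the block above, requesting more layers than the model has (n_gpu_layers > n_layer) additionally offloads the output tensor.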
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -31,6 +31,11 @@
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+#define LLAMA_SUPPORTS_GPU_OFFLOAD
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.1.2'
+  VERSION = '0.1.4'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-ffb06a3'
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-06-03 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -27,7 +27,7 @@ files:
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
 - ext/llama_cpp/src/ggml-cuda.h
-- ext/llama_cpp/src/ggml-opencl.c
+- ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h