llama_cpp 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -7
- data/ext/llama_cpp/extconf.rb +1 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +1028 -0
- data/ext/llama_cpp/src/ggml-opencl.h +8 -10
- data/ext/llama_cpp/src/ggml.c +568 -57
- data/ext/llama_cpp/src/ggml.h +21 -2
- data/ext/llama_cpp/src/llama.cpp +37 -2
- data/ext/llama_cpp/src/llama.h +5 -0
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -3
- data/ext/llama_cpp/src/ggml-opencl.c +0 -474
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -198,6 +198,7 @@
 #define GGML_MAX_PARAMS 256
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_OPT 4
+#define GGML_MAX_NAME 32
 #define GGML_DEFAULT_N_THREADS 4
 
 #define GGML_ASSERT(x) \
@@ -249,6 +250,7 @@ extern "C" {
     enum ggml_backend {
         GGML_BACKEND_CPU = 0,
         GGML_BACKEND_CUDA = 1,
+        GGML_BACKEND_CL = 2,
     };
 
     // model file types
@@ -371,11 +373,13 @@ extern "C" {
 
         void * data;
 
-        char name[
+        char name[GGML_MAX_NAME];
 
         char padding[16];
     };
 
+    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
@@ -428,6 +432,7 @@ extern "C" {
     GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 
     GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_op_name (enum ggml_op op);
 
     GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
 
@@ -436,6 +441,9 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
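Note: the new ggml_tensor_overhead() helper returns the fixed per-tensor metadata cost inside a ggml context, which makes it possible to size a context buffer before creating tensors. A minimal sketch under that assumption (tensor count, element count, and the headroom constant are illustrative, not part of the diff):

    #include "ggml.h"

    // Size a context for a known number of tensors: each one needs its data
    // bytes plus the fixed metadata overhead reported by ggml, with a little
    // headroom for internal alignment.
    static struct ggml_context * make_ctx(void) {
        const int    n_tensors = 2;
        const size_t n_elems   = 1024;

        const size_t mem_size =
            n_tensors * (ggml_tensor_overhead() + n_elems * sizeof(float)) + 1024;

        struct ggml_init_params params = {
            /*.mem_size   =*/ mem_size,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        return ggml_init(params);
    }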
@@ -443,7 +451,11 @@ extern "C" {
 
     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
 
-    GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+
+    GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
+    GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
 
     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,
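The ggml_get_mem_buffer()/ggml_get_mem_size() getters expose a context's backing arena, and together with the existing ggml_used_mem() they allow simple usage reporting; ggml_set_no_alloc() additionally lets a context create tensor metadata without reserving data. A small sketch using only the declarations shown above:

    #include <stdio.h>
    #include "ggml.h"

    // Print how much of a context's memory arena is currently used.
    static void print_ctx_usage(struct ggml_context * ctx) {
        void * base        = ggml_get_mem_buffer(ctx);
        const size_t used  = ggml_used_mem(ctx);
        const size_t total = ggml_get_mem_size(ctx);

        printf("ggml context arena at %p: %zu / %zu bytes used\n", base, used, total);
    }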
@@ -483,6 +495,8 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
 
+    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
     GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
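ggml_get_tensor() looks a tensor up in a context by the name stored in its name[GGML_MAX_NAME] field. A rough sketch, writing the name directly into that field (no setter helper is assumed, and the tensor shape and name are illustrative):

    #include <stdio.h>
    #include "ggml.h"

    // Create a tensor, tag it with a name, and fetch it back from the context.
    static void name_lookup_demo(struct ggml_context * ctx) {
        struct ggml_tensor * w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
        snprintf(w->name, sizeof(w->name), "demo.weight");

        struct ggml_tensor * found = ggml_get_tensor(ctx, "demo.weight");
        if (found == w) {
            printf("found tensor '%s'\n", found->name);
        }
    }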
@@ -969,6 +983,11 @@ extern "C" {
     GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
     GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
 
+    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+
+    GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
+    GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
+
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
 
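ggml_graph_export()/ggml_graph_import() serialize a computation graph to a file and load it back into fresh data/eval contexts, and ggml_graph_get_tensor() recovers tensors from a graph by name. A hedged sketch of the round trip (the file name and tensor name are illustrative):

    #include <stddef.h>
    #include "ggml.h"

    // Export a built graph, re-import it, and look up a named tensor.
    static void graph_roundtrip(struct ggml_cgraph * gf) {
        ggml_graph_export(gf, "demo-graph.bin");

        struct ggml_context * ctx_data = NULL;
        struct ggml_context * ctx_eval = NULL;
        struct ggml_cgraph gf_in = ggml_graph_import("demo-graph.bin", &ctx_data, &ctx_eval);

        struct ggml_tensor * input = ggml_graph_get_tensor(&gf_in, "input");
        if (input != NULL) {
            // ... fill `input`, then recompute with ggml_graph_compute(ctx_eval, &gf_in)
        }
    }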
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -12,6 +12,8 @@
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
+#elif defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
 #endif
 
 #include <array>
@@ -40,6 +42,7 @@
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_3B,
     MODEL_7B,
     MODEL_13B,
     MODEL_30B,
@@ -56,6 +59,7 @@ static const size_t MB = 1024*1024;
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 128ull * MB },
         { MODEL_7B, 512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
@@ -67,6 +71,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 128ull * MB },
         { MODEL_7B, 512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
@@ -79,6 +84,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 682ull * MB },
         { MODEL_7B, 1026ull * MB },
         { MODEL_13B, 1608ull * MB },
         { MODEL_30B, 3124ull * MB },
@@ -92,6 +98,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 512ull * MB },
         { MODEL_7B, 768ull * MB },
         { MODEL_13B, 1024ull * MB },
         { MODEL_30B, 1280ull * MB },
@@ -897,6 +904,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
 
 static const char *llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
@@ -930,6 +938,7 @@ static void llama_model_load_internal(
 
     {
         switch (hparams.n_layer) {
+            case 26: model.type = e_model::MODEL_3B; break;
             case 32: model.type = e_model::MODEL_7B; break;
             case 40: model.type = e_model::MODEL_13B; break;
             case 60: model.type = e_model::MODEL_30B; break;
@@ -1092,7 +1101,7 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
     }
     fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-#
+#elif !defined(GGML_USE_CLBLAST)
     (void) n_gpu_layers;
 #endif
 }
@@ -1125,7 +1134,33 @@ static void llama_model_load_internal(
             done_size += lt.size;
         }
     }
-#
+#elif defined(GGML_USE_CLBLAST)
+    {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
+
+        size_t vram_total = 0;
+
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+
+            ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+            ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+            ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+            ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+            ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+            ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+            ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        }
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
+            ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        }
+
+        fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
+    }
+#endif
 
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -31,6 +31,11 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+#define LLAMA_SUPPORTS_GPU_OFFLOAD
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
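The LLAMA_SUPPORTS_GPU_OFFLOAD macro gives callers a compile-time way to tell whether the bundled llama.cpp was built with cuBLAS or CLBlast. A small sketch gating a requested layer count on it (the helper function is made up for illustration):

    #include <stdio.h>
    #include "llama.h"

    // Clamp a requested GPU layer count to zero when offload support is absent.
    static int pick_gpu_layers(int requested) {
    #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
        return requested;
    #else
        if (requested > 0) {
            fprintf(stderr, "warning: built without GPU offload, ignoring n_gpu_layers\n");
        }
        return 0;
    #endif
    }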
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.1.2'
+  VERSION = '0.1.4'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-ffb06a3'
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-06-03 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -27,7 +27,7 @@ files:
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
 - ext/llama_cpp/src/ggml-cuda.h
-- ext/llama_cpp/src/ggml-opencl.c
+- ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h