llama_cpp 0.1.2 → 0.1.4

ext/llama_cpp/src/ggml.h CHANGED
@@ -198,6 +198,7 @@
  #define GGML_MAX_PARAMS 256
  #define GGML_MAX_CONTEXTS 64
  #define GGML_MAX_OPT 4
+ #define GGML_MAX_NAME 32
  #define GGML_DEFAULT_N_THREADS 4

  #define GGML_ASSERT(x) \
@@ -249,6 +250,7 @@ extern "C" {
  enum ggml_backend {
  GGML_BACKEND_CPU = 0,
  GGML_BACKEND_CUDA = 1,
+ GGML_BACKEND_CL = 2,
  };

  // model file types
@@ -371,11 +373,13 @@ extern "C" {

  void * data;

- char name[32];
+ char name[GGML_MAX_NAME];

  char padding[16];
  };

+ static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
  // computation graph
  struct ggml_cgraph {
  int n_nodes;
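With the name buffer now sized by GGML_MAX_NAME and GGML_TENSOR_SIZE exposing the struct size, tensor names are limited to GGML_MAX_NAME - 1 characters plus the terminating NUL, so longer names should be expected to be truncated. A minimal C sketch of naming a tensor; the arena size and the example name are arbitrary, and ggml_set_name() is assumed to be available in this ggml revision:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /* .mem_size   = */ 16 * 1024 * 1024,   // small scratch arena, enough for this example
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_set_name(t, "layers.0.attention.wq.weight");   // must fit in GGML_MAX_NAME bytes

        // the name lives directly in the public struct ggml_tensor
        printf("name = '%s', sizeof(struct ggml_tensor) = %zu\n", t->name, sizeof(struct ggml_tensor));

        ggml_free(ctx);
        return 0;
    }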
@@ -428,6 +432,7 @@ extern "C" {
  GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float

  GGML_API const char * ggml_type_name(enum ggml_type type);
+ GGML_API const char * ggml_op_name (enum ggml_op op);

  GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);

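ggml_op_name() pairs with the existing ggml_type_name() and is handy for dumping a graph in human-readable form. A small self-contained sketch (the shapes and the tiny arena size are arbitrary):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /* .mem_size   = */ 16 * 1024 * 1024,
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
        struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
        struct ggml_cgraph  gf = ggml_build_forward(ggml_mul_mat(ctx, a, b));

        // print every node of the graph with its operation and element type
        for (int i = 0; i < gf.n_nodes; ++i) {
            const struct ggml_tensor * node = gf.nodes[i];
            printf("node %2d: op = %-10s type = %s\n",
                   i, ggml_op_name(node->op), ggml_type_name(node->type));
        }

        ggml_free(ctx);
        return 0;
    }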
@@ -436,6 +441,9 @@ extern "C" {
  // TODO: temporary until model loading of ggml examples is refactored
  GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

+ // use this to compute the memory overhead of a tensor
+ GGML_API size_t ggml_tensor_overhead(void);
+
  // main

  GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
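ggml_tensor_overhead() reports the fixed per-tensor cost inside a context's memory pool (object header plus the tensor struct), so a context can be sized arithmetically instead of by trial and error. A sketch with arbitrarily chosen tensor sizes:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // plan: two F32 tensors of 1024 and 4096 elements
        const size_t n_tensors = 2;
        const size_t data_size = (1024 + 4096) * sizeof(float);

        // per-tensor overhead is now queryable instead of hard-coded
        const size_t mem_size = n_tensors * ggml_tensor_overhead() + data_size;

        struct ggml_init_params params = {
            /* .mem_size   = */ mem_size,
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);

        printf("planned %zu bytes, used %zu bytes\n", mem_size, ggml_used_mem(ctx));

        ggml_free(ctx);
        return 0;
    }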
@@ -443,7 +451,11 @@ extern "C" {

  GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);

- GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+ GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+ GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+
+ GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
+ GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);

  GGML_API struct ggml_tensor * ggml_new_tensor(
  struct ggml_context * ctx,
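ggml_set_no_alloc() (or the no_alloc flag of ggml_init_params) creates tensors without allocating their data, and ggml_get_mem_buffer()/ggml_get_mem_size() expose the underlying pool. Together they support a measure-first pattern: describe the tensors once, add up their data sizes, then create the real context. A sketch under those assumptions:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // pass 1: metadata-only context -- tensor data is not allocated
        struct ggml_init_params meta_params = {
            /* .mem_size   = */ 1 * 1024 * 1024,
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ true,   // equivalently: ggml_set_no_alloc(meta, true) after init
        };
        struct ggml_context * meta = ggml_init(meta_params);

        struct ggml_tensor * a = ggml_new_tensor_2d(meta, GGML_TYPE_F32, 4096, 4096);
        struct ggml_tensor * b = ggml_new_tensor_1d(meta, GGML_TYPE_F32, 4096);

        // metadata sits in the pool; data requirements are summed separately
        const size_t need = ggml_used_mem(meta) + ggml_nbytes(a) + ggml_nbytes(b);

        printf("pool %p of %zu bytes holds the metadata; a full context would need ~%zu bytes\n",
               ggml_get_mem_buffer(meta), ggml_get_mem_size(meta), need);

        ggml_free(meta);

        // pass 2: call ggml_init() again with mem_size = need (plus headroom) and no_alloc = false
        return 0;
    }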
@@ -483,6 +495,8 @@ extern "C" {
  GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
  GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);

+ GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+
  GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
  GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
  GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
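ggml_get_tensor() looks a tensor up by the name stored in its name field, so a tensor created in one place can be retrieved later without carrying pointers around. A minimal sketch, assuming ggml_set_name() from the same header is used to assign the name:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /* .mem_size   = */ 16 * 1024 * 1024,
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 16, 16);
        ggml_set_name(w, "proj.weight");

        // ...elsewhere, with only the context at hand:
        struct ggml_tensor * found = ggml_get_tensor(ctx, "proj.weight");
        if (found) {
            printf("found %s: %d x %d\n", found->name, (int) found->ne[0], (int) found->ne[1]);
        }

        ggml_free(ctx);
        return 0;
    }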
@@ -969,6 +983,11 @@ extern "C" {
  GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
  GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);

+ GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+
+ GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
+ GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
+
  // print info and performance information for the graph
  GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);

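ggml_graph_export() writes a computation graph to a file and ggml_graph_import() reloads it into two fresh contexts (one for the stored tensor data, one for evaluation), while ggml_graph_get_tensor() finds a node of the loaded graph by name. The round trip below is only a sketch: it assumes the export format carries the leaf tensors' data (as the ctx_data parameter suggests) and that a separate work context is enough to recompute the imported graph; the file name and tensor names are arbitrary.

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /* .mem_size   = */ 64 * 1024 * 1024,
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // f = w * x (element-wise), with named tensors so they can be found after import
        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        struct ggml_tensor * w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_set_name(x, "x");
        ggml_set_name(w, "w");
        ggml_set_f32(w, 3.0f);

        struct ggml_tensor * f = ggml_mul(ctx, w, x);
        ggml_set_name(f, "f");

        struct ggml_cgraph gf = ggml_build_forward(f);
        ggml_graph_export(&gf, "example.ggml");

        // later, possibly in another process: reload and recompute
        struct ggml_context * ctx_data = NULL;
        struct ggml_context * ctx_eval = NULL;
        struct ggml_cgraph gf_in = ggml_graph_import("example.ggml", &ctx_data, &ctx_eval);

        struct ggml_init_params work_params = {
            /* .mem_size   = */ 16 * 1024 * 1024,
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ false,
        };
        struct ggml_context * ctx_work = ggml_init(work_params);

        ggml_set_f32(ggml_graph_get_tensor(&gf_in, "x"), 2.0f);   // fill the input by name
        ggml_graph_compute(ctx_work, &gf_in);

        printf("f[0] = %f\n", ggml_get_f32_1d(ggml_graph_get_tensor(&gf_in, "f"), 0));   // expect 6.0

        ggml_free(ctx_work);
        ggml_free(ctx_eval);
        ggml_free(ctx_data);
        ggml_free(ctx);
        return 0;
    }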
ext/llama_cpp/src/llama.cpp CHANGED
@@ -12,6 +12,8 @@
  #include "ggml.h"
  #ifdef GGML_USE_CUBLAS
  #include "ggml-cuda.h"
+ #elif defined(GGML_USE_CLBLAST)
+ #include "ggml-opencl.h"
  #endif

  #include <array>
@@ -40,6 +42,7 @@
  // available llama models
  enum e_model {
  MODEL_UNKNOWN,
+ MODEL_3B,
  MODEL_7B,
  MODEL_13B,
  MODEL_30B,
@@ -56,6 +59,7 @@ static const size_t MB = 1024*1024;
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  {
  static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 128ull * MB },
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
@@ -67,6 +71,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
  {
  static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 128ull * MB },
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
@@ -79,6 +84,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
  static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
  {
  static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 682ull * MB },
  { MODEL_7B, 1026ull * MB },
  { MODEL_13B, 1608ull * MB },
  { MODEL_30B, 3124ull * MB },
@@ -92,6 +98,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
  static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  {
  static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 512ull * MB },
  { MODEL_7B, 768ull * MB },
  { MODEL_13B, 1024ull * MB },
  { MODEL_30B, 1280ull * MB },
@@ -897,6 +904,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {

  static const char *llama_model_type_name(e_model type) {
  switch (type) {
+ case MODEL_3B: return "3B";
  case MODEL_7B: return "7B";
  case MODEL_13B: return "13B";
  case MODEL_30B: return "30B";
@@ -930,6 +938,7 @@ static void llama_model_load_internal(

  {
  switch (hparams.n_layer) {
+ case 26: model.type = e_model::MODEL_3B; break;
  case 32: model.type = e_model::MODEL_7B; break;
  case 40: model.type = e_model::MODEL_13B; break;
  case 60: model.type = e_model::MODEL_30B; break;
@@ -1092,7 +1101,7 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
  }
  fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
- #else
+ #elif !defined(GGML_USE_CLBLAST)
  (void) n_gpu_layers;
  #endif
  }
@@ -1125,7 +1134,33 @@ static void llama_model_load_internal(
  done_size += lt.size;
  }
  }
- #endif // GGML_USE_CUBLAS
+ #elif defined(GGML_USE_CLBLAST)
+ {
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+ fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
+
+ size_t vram_total = 0;
+
+ for (int i = 0; i < n_gpu; ++i) {
+ const auto & layer = model.layers[i];
+
+ ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+ ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+ ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+ ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+ ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+ ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+ ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+ }
+ if (n_gpu_layers > (int) hparams.n_layer) {
+ fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
+ ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+ }
+
+ fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
+ }
+ #endif

  if (progress_callback) {
  progress_callback(1.0f, progress_callback_user_data);
ext/llama_cpp/src/llama.h CHANGED
@@ -31,6 +31,11 @@
  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
  #define LLAMA_SESSION_VERSION 1

+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+ // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+ #define LLAMA_SUPPORTS_GPU_OFFLOAD
+ #endif
+
  #ifdef __cplusplus
  extern "C" {
  #endif
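Client code can now branch at compile time on whether the bundled llama.cpp was built with GPU offload support. A sketch, assuming the n_gpu_layers field of llama_context_params present in this revision of llama.h:

    #include <stdio.h>
    #include "llama.h"

    int main(void) {
        struct llama_context_params params = llama_context_default_params();

    #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
        // built with cuBLAS or CLBlast: it makes sense to ask for layers on the GPU
        params.n_gpu_layers = 32;
    #else
        fprintf(stderr, "no GPU offload support in this build; running on CPU only\n");
    #endif

        // model loading with these params would follow here
        (void) params;
        return 0;
    }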
lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.1.2'
+ VERSION = '0.1.4'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-265db98'
+ LLAMA_CPP_VERSION = 'master-ffb06a3'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.1.2
+ version: 0.1.4
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-05-22 00:00:00.000000000 Z
+ date: 2023-06-03 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -27,7 +27,7 @@ files:
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
  - ext/llama_cpp/src/ggml-cuda.h
- - ext/llama_cpp/src/ggml-opencl.c
+ - ext/llama_cpp/src/ggml-opencl.cpp
  - ext/llama_cpp/src/ggml-opencl.h
  - ext/llama_cpp/src/ggml.c
  - ext/llama_cpp/src/ggml.h