llama_cpp 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ext/llama_cpp/src/ggml.h CHANGED
@@ -198,6 +198,7 @@
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_OPT           4
+#define GGML_MAX_NAME          32
 #define GGML_DEFAULT_N_THREADS 4
 
 #define GGML_ASSERT(x) \
@@ -249,6 +250,7 @@ extern "C" {
     enum ggml_backend {
         GGML_BACKEND_CPU = 0,
         GGML_BACKEND_CUDA = 1,
+        GGML_BACKEND_CL = 2,
     };
 
     // model file types
@@ -371,11 +373,13 @@ extern "C" {
 
         void * data;
 
-        char name[32];
+        char name[GGML_MAX_NAME];
 
         char padding[16];
     };
 
+    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
@@ -428,6 +432,7 @@ extern "C" {
     GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 
     GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_op_name  (enum ggml_op   op);
 
     GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
 
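The new ggml_op_name() mirrors the existing ggml_type_name() and is mainly useful for debug output. A minimal sketch of how the pair can be used (not part of the diff; GGML_OP_MUL_MAT and GGML_TYPE_Q4_0 are pre-existing ggml enum values):

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        // Both helpers return pointers to static strings owned by ggml.
        printf("op:   %s\n", ggml_op_name(GGML_OP_MUL_MAT));
        printf("type: %s\n", ggml_type_name(GGML_TYPE_Q4_0));
        return 0;
    }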
@@ -436,6 +441,9 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
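Per the new comment, ggml_tensor_overhead() reports the fixed per-tensor bookkeeping cost (the tensor struct plus ggml's internal object header), so it can be used to budget a context's mem_size up front. A rough sketch under that assumption, not taken from the diff:

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        // Assumed budgeting rule: room for 1024 tensor headers plus 1 MiB of
        // tensor data; the header only promises that the overhead value
        // covers per-tensor bookkeeping.
        const size_t n_tensors = 1024;
        const size_t mem_size  = n_tensors*ggml_tensor_overhead() + 1024*1024;

        struct ggml_init_params params = {
            /*.mem_size   =*/ mem_size,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };

        struct ggml_context * ctx = ggml_init(params);
        printf("overhead per tensor: %zu bytes\n", ggml_tensor_overhead());
        ggml_free(ctx);
        return 0;
    }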
@@ -443,7 +451,11 @@ extern "C" {
 
     GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);
 
-    GGML_API size_t  ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API size_t  ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API void    ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+
+    GGML_API void *  ggml_get_mem_buffer(struct ggml_context * ctx);
+    GGML_API size_t  ggml_get_mem_size  (struct ggml_context * ctx);
 
     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,
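ggml_set_no_alloc() toggles metadata-only mode on a live context, and ggml_get_mem_buffer()/ggml_get_mem_size() expose the arena that ggml_init() reserved. A hedged sketch of one way these could be combined (not from the diff; the measure-then-allocate pattern is an assumption, not something the header prescribes):

    #include "ggml.h"
    #include <stdbool.h>
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // Metadata-only mode: tensors created from here on get headers but no
        // data buffers, so ggml_used_mem() grows only by the bookkeeping.
        ggml_set_no_alloc(ctx, true);
        struct ggml_tensor * probe = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);
        (void) probe;

        printf("arena %p: %zu bytes total, %zu used\n",
               ggml_get_mem_buffer(ctx), ggml_get_mem_size(ctx), ggml_used_mem(ctx));

        // Back to normal allocation for tensors that need real data.
        ggml_set_no_alloc(ctx, false);

        ggml_free(ctx);
        return 0;
    }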
@@ -483,6 +495,8 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
 
+    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
     GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
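ggml_get_tensor() looks a tensor up in a context by the name stored in the name[GGML_MAX_NAME] field. A minimal sketch (not from the diff), assuming ggml_set_name(), which the bundled ggml already declares:

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 8);
        ggml_set_name(w, "layers.0.weight");   // example name; must fit in GGML_MAX_NAME

        // Later, retrieve the same tensor from the context by name.
        struct ggml_tensor * found = ggml_get_tensor(ctx, "layers.0.weight");
        printf("lookup %s\n", found == w ? "succeeded" : "failed");

        ggml_free(ctx);
        return 0;
    }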
@@ -969,6 +983,11 @@ extern "C" {
     GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
     GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
 
+    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+
+    GGML_API void               ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
+    GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
+
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
 
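ggml_graph_export() writes a computation graph (and its tensor data) to a file, ggml_graph_import() reads it back into two freshly allocated contexts (one for tensor data, one for the graph itself), and ggml_graph_get_tensor() fetches a node of the imported graph by name. A rough round-trip sketch, not from the diff; the tensor name and file name are made up, and ggml_set_name()/ggml_build_forward() are assumed from the bundled ggml:

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // Build a tiny graph: y = a*b.
        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_set_f32(a, 1.0f);
        ggml_set_f32(b, 2.0f);
        struct ggml_tensor * y = ggml_mul(ctx, a, b);
        ggml_set_name(y, "y");

        struct ggml_cgraph gf = ggml_build_forward(y);

        // Round-trip the graph through a file.
        ggml_graph_export(&gf, "tiny.ggml");

        struct ggml_context * ctx_data = NULL;
        struct ggml_context * ctx_eval = NULL;
        struct ggml_cgraph gf_in = ggml_graph_import("tiny.ggml", &ctx_data, &ctx_eval);

        struct ggml_tensor * y_in = ggml_graph_get_tensor(&gf_in, "y");
        printf("imported node \"y\": %s\n", y_in ? "found" : "missing");

        ggml_free(ctx_eval);
        ggml_free(ctx_data);
        ggml_free(ctx);
        return 0;
    }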
ext/llama_cpp/src/llama.cpp CHANGED
@@ -12,6 +12,8 @@
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
+#elif defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
 #endif
 
 #include <array>
@@ -40,6 +42,7 @@
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_3B,
     MODEL_7B,
     MODEL_13B,
     MODEL_30B,
@@ -56,6 +59,7 @@ static const size_t MB = 1024*1024;
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,  128ull * MB },
         { MODEL_7B,  512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
@@ -67,6 +71,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,  128ull * MB },
         { MODEL_7B,  512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
@@ -79,6 +84,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,   682ull * MB },
         { MODEL_7B,  1026ull * MB },
         { MODEL_13B, 1608ull * MB },
         { MODEL_30B, 3124ull * MB },
@@ -92,6 +98,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,   512ull * MB },
         { MODEL_7B,   768ull * MB },
         { MODEL_13B, 1024ull * MB },
         { MODEL_30B, 1280ull * MB },
@@ -897,6 +904,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
 
 static const char *llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
@@ -930,6 +938,7 @@ static void llama_model_load_internal(
 
     {
         switch (hparams.n_layer) {
+            case 26: model.type = e_model::MODEL_3B; break;
            case 32: model.type = e_model::MODEL_7B; break;
            case 40: model.type = e_model::MODEL_13B; break;
            case 60: model.type = e_model::MODEL_30B; break;
@@ -1092,7 +1101,7 @@ static void llama_model_load_internal(
             fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
         }
         fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-#else
+#elif !defined(GGML_USE_CLBLAST)
         (void) n_gpu_layers;
 #endif
     }
@@ -1125,7 +1134,33 @@ static void llama_model_load_internal(
             done_size += lt.size;
         }
     }
-#endif // GGML_USE_CUBLAS
+#elif defined(GGML_USE_CLBLAST)
+    {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
+
+        size_t vram_total = 0;
+
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+
+            ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+            ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+            ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+            ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+            ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+            ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+            ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        }
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
+            ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        }
+
+        fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
+    }
+#endif
 
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
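With a CLBlast build, the loader now copies up to n_gpu_layers per-layer weight matrices to the GPU via ggml_cl_transform_tensor(), mirroring the existing cuBLAS path; asking for more layers than the model has also offloads the output tensor. Nothing changes for callers beyond setting n_gpu_layers. A hedged sketch using the public C API ("model.bin" is a placeholder path, not something shipped with the gem):

    #include "llama.h"
    #include <stdio.h>

    int main(void) {
        struct llama_context_params params = llama_context_default_params();
        params.n_gpu_layers = 24;   // values above n_layer also offload the output tensor

        struct llama_context * lctx = llama_init_from_file("model.bin", params);
        if (lctx == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        llama_free(lctx);
        return 0;
    }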
ext/llama_cpp/src/llama.h CHANGED
@@ -31,6 +31,11 @@
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+#define LLAMA_SUPPORTS_GPU_OFFLOAD
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
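LLAMA_SUPPORTS_GPU_OFFLOAD gives applications and bindings a compile-time way to tell whether n_gpu_layers will have any effect. A small sketch of the kind of guard a caller might add (not from the diff):

    #include "llama.h"
    #include <stdio.h>

    int main(void) {
        struct llama_context_params params = llama_context_default_params();

    #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
        params.n_gpu_layers = 32;   // cuBLAS or CLBlast build: request offload of 32 layers
    #else
        fprintf(stderr, "built without GPU offload; running on CPU only\n");
    #endif

        (void) params;
        return 0;
    }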
lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.1.2'
+  VERSION = '0.1.4'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-265db98'
+  LLAMA_CPP_VERSION = 'master-ffb06a3'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-05-22 00:00:00.000000000 Z
+date: 2023-06-03 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -27,7 +27,7 @@ files:
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
 - ext/llama_cpp/src/ggml-cuda.h
-- ext/llama_cpp/src/ggml-opencl.c
+- ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h