llama_cpp 0.1.4 → 0.2.1

@@ -1,6 +1,13 @@
  #ifndef LLAMA_H
  #define LLAMA_H

+ #include "ggml.h"
+ #ifdef GGML_USE_CUBLAS
+ #include "ggml-cuda.h"
+ #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+ #else
+ #define LLAMA_MAX_DEVICES 1
+ #endif // GGML_USE_CUBLAS
  #include <stddef.h>
  #include <stdint.h>
  #include <stdbool.h>
@@ -31,7 +38,7 @@
  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
  #define LLAMA_SESSION_VERSION 1

- #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
  #define LLAMA_SUPPORTS_GPU_OFFLOAD
  #endif
@@ -65,9 +72,13 @@ extern "C" {
  typedef void (*llama_progress_callback)(float progress, void *ctx);

  struct llama_context_params {
- int n_ctx; // text context
- int n_gpu_layers; // number of layers to store in VRAM
- int seed; // RNG seed, -1 for random
+ int n_ctx; // text context
+ int n_batch; // prompt processing batch size
+ int n_gpu_layers; // number of layers to store in VRAM
+ int main_gpu; // the GPU that is used for scratch and small tensors
+ float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+ bool low_vram; // if true, reduce VRAM usage at the cost of performance
+ int seed; // RNG seed, -1 for random

  bool f16_kv; // use fp16 for KV cache
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
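The new context fields are mirrored by accessors on `LLaMACpp::ContextParams` in the Ruby binding (see the `sig/llama_cpp.rbs` changes further down). A minimal sketch of configuring them, assuming `ContextParams.new` with no arguments returns the defaults from `llama_context_default_params()`:

```ruby
require 'llama_cpp'

# Sketch only: accessor names follow the updated sig/llama_cpp.rbs.
params = LLaMACpp::ContextParams.new
params.n_ctx        = 512    # text context
params.n_batch      = 512    # prompt processing batch size
params.n_gpu_layers = 32     # layers to keep in VRAM (needs LLAMA_SUPPORTS_GPU_OFFLOAD)
params.main_gpu     = 0      # GPU used for scratch and small tensors
params.low_vram     = false  # trade performance for lower VRAM usage when true
params.seed         = 42     # RNG seed, -1 for random

# tensor_split is a reader in the signature file; its length corresponds to
# LLAMA_MAX_DEVICES (1 unless llama.cpp was built with cuBLAS).
p params.tensor_split
```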
@@ -94,9 +105,27 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
  };

+ // model quantization parameters
+ typedef struct llama_model_quantize_params {
+ int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+ enum llama_ftype ftype; // quantize to this llama_ftype
+ bool allow_requantize; // allow quantizing non-f32/f16 tensors
+ bool quantize_output_tensor; // quantize output.weight
+ } llama_model_quantize_params;
+
  LLAMA_API struct llama_context_params llama_context_default_params();
+ LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();

  LLAMA_API bool llama_mmap_supported();
  LLAMA_API bool llama_mlock_supported();
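On the Ruby side, the old `ftype:`/`n_threads:` keywords of `model_quantize` are replaced by a `ModelQuantizeParams` object (see the signature changes below). A hedged sketch of quantizing to one of the new k-quant formats; the file paths are placeholders, and `ModelQuantizeParams.new` is assumed to return the defaults from `llama_model_quantize_default_params()`:

```ruby
require 'llama_cpp'

# Sketch only: input/output paths are placeholders.
params = LLaMACpp::ModelQuantizeParams.new
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M  # one of the new k-quant types
params.n_thread = 4                                 # <= 0 falls back to hardware_concurrency
params.quantize_output_tensor = true                # also quantize output.weight

LLaMACpp.model_quantize(input_path: 'ggml-model-f16.bin',
                        output_path: 'ggml-model-q4_k_m.bin',
                        params: params)
```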
@@ -118,14 +147,11 @@ extern "C" {
  // Frees all allocated memory
  LLAMA_API void llama_free(struct llama_context * ctx);

- // TODO: not great API - very likely to change
  // Returns 0 on success
- // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
  LLAMA_API int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
- enum llama_ftype ftype,
- int nthread);
+ const llama_model_quantize_params * params);

  // Apply a LoRA adapter to a loaded model
  // path_base_model is the path to a higher quality model to use as a base for
@@ -173,6 +199,12 @@ extern "C" {
  int n_past,
  int n_threads);

+ // Export a static computation graph for context of 511 and batch size of 1
+ // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
+ // parameters here to keep things simple
+ // IMPORTANT: do not use for anything else other than debugging and testing!
+ LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+
  // Convert the provided text into tokens.
  // The tokens pointer must be large enough to hold the resulting tokens.
  // Returns the number of tokens on success, no more than n_max_tokens
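The binding exposes the graph export as `Context#eval_export` (see `sig/llama_cpp.rbs` below). A minimal sketch, assuming `context` is an already-initialized `LLaMACpp::Context` and using an arbitrary output filename; per the header comment, this is for debugging and testing only:

```ruby
# Sketch only: `context` is assumed to be an initialized LLaMACpp::Context.
# Writes the static computation graph (context 511, batch size 1) to a file.
ok = context.eval_export('llama.ggml')
warn 'graph export failed' unless ok
```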
@@ -189,6 +221,14 @@ extern "C" {
  LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
  LLAMA_API int llama_n_embd (const struct llama_context * ctx);

+ // Get the vocabulary as output parameters.
+ // Returns number of results.
+ LLAMA_API int llama_get_vocab(
+ const struct llama_context * ctx,
+ const char * * strings,
+ float * scores,
+ int capacity);
+
  // Token logits obtained from the last call to llama_eval()
  // The logits for the last token are stored in the last row
  // Can be mutated in order to change the probabilities of the next token
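In Ruby this surfaces as `Context#vocab(capacity:)`, which returns the token strings and scores as a pair of arrays (per the updated signature file). A minimal sketch, again assuming `context` is an initialized `LLaMACpp::Context`:

```ruby
# Sketch only: `context` is assumed to be an initialized LLaMACpp::Context.
# Returns [Array[String], Array[Float]] with at most `capacity` entries.
strings, scores = context.vocab(capacity: context.n_vocab)
puts "#{strings.size} tokens, first: #{strings.first.inspect} (score #{scores.first})"
```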
@@ -204,9 +244,9 @@ extern "C" {
  LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);

  // Special tokens
- LLAMA_API llama_token llama_token_bos();
- LLAMA_API llama_token llama_token_eos();
- LLAMA_API llama_token llama_token_nl();
+ LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
+ LLAMA_API llama_token llama_token_eos(); // end-of-sentence
+ LLAMA_API llama_token llama_token_nl(); // next-line

  // Sampling functions

@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.1.4'
+ VERSION = '0.2.1'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-ffb06a3'
+ LLAMA_CPP_VERSION = 'master-a09f919'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -5,6 +5,8 @@ module LLaMACpp
  LLAMA_FILE_MAGIC: String
  LLAMA_FILE_MAGIC_UNVERSIONED: String

+ LLAMA_MAX_DEVICES: Integer
+
  LLAMA_FTYPE_ALL_F32: Integer
  LLAMA_FTYPE_MOSTLY_F16: Integer
  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -13,9 +15,18 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q8_0: Integer
  LLAMA_FTYPE_MOSTLY_Q5_0: Integer
  LLAMA_FTYPE_MOSTLY_Q5_1: Integer
+ LLAMA_FTYPE_MOSTLY_Q2_K: Integer
+ LLAMA_FTYPE_MOSTLY_Q3_K_S: Integer
+ LLAMA_FTYPE_MOSTLY_Q3_K_M: Integer
+ LLAMA_FTYPE_MOSTLY_Q3_K_L: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_K_S: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_K_M: Integer
+ LLAMA_FTYPE_MOSTLY_Q5_K_S: Integer
+ LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
+ LLAMA_FTYPE_MOSTLY_Q6_K: Integer

  def self?.init_backend: () -> void
- def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
+ def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
  def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
  def self?.print_system_info: () -> void
  def self?.token_bos: () -> Integer
@@ -52,12 +63,14 @@ module LLaMACpp
  def embeddings: () -> Array[Float]
  def empty?: () -> bool
  def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+ def eval_export: (String) -> bool
  def free: () -> void
  def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
  def logits: () -> Array[Float]
  def n_ctx: () -> Integer
  def n_embd: () -> Integer
  def n_vocab: () -> Integer
+ def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
  def print_timings: () -> void
  def reset_timings: () -> void
  def token_to_str: (Integer) -> String
@@ -92,6 +105,15 @@ module LLaMACpp
  def logits_all=: (bool) -> bool
  def n_ctx: () -> Integer
  def n_ctx=: (Integer) -> Integer
+ def n_batch: () -> Integer
+ def n_batch=: (Integer) -> Integer
+ def n_gpu_layers: () -> Integer
+ def n_gpu_layers=: (Integer) -> Integer
+ def main_gpu: () -> Integer
+ def main_gpu=: (Integer) -> Integer
+ def tensor_split: () -> Array[Float]
+ def low_vram: () -> bool
+ def low_vram=: (bool) -> bool
  def seed: () -> Integer
  def seed=: (Integer) -> Integer
  def use_mlock: () -> bool
@@ -102,6 +124,19 @@ module LLaMACpp
  def vocab_only=: (bool) -> bool
  end

+ class ModelQuantizeParams
+ public
+
+ def n_thread: () -> Integer
+ def n_thread=: (Integer) -> Integer
+ def ftype: () -> Integer
+ def ftype=: (Integer) -> Integer
+ def allow_quantization: () -> bool
+ def allow_quantization=: (bool) -> bool
+ def quantize_output_tensor: () -> bool
+ def quantize_output_tensor=: (bool) -> bool
+ end
+
  class Params = ContextParams

  class Client
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.1.4
+ version: 0.2.1
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-06-03 00:00:00.000000000 Z
+ date: 2023-06-17 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -22,15 +22,23 @@ files:
  - CODE_OF_CONDUCT.md
  - LICENSE.txt
  - README.md
+ - examples/README.md
+ - examples/chat.rb
  - ext/llama_cpp/extconf.rb
  - ext/llama_cpp/llama_cpp.cpp
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
+ - ext/llama_cpp/src/ggml-cuda.cu
  - ext/llama_cpp/src/ggml-cuda.h
+ - ext/llama_cpp/src/ggml-metal.h
+ - ext/llama_cpp/src/ggml-metal.m
+ - ext/llama_cpp/src/ggml-metal.metal
  - ext/llama_cpp/src/ggml-opencl.cpp
  - ext/llama_cpp/src/ggml-opencl.h
  - ext/llama_cpp/src/ggml.c
  - ext/llama_cpp/src/ggml.h
+ - ext/llama_cpp/src/k_quants.c
+ - ext/llama_cpp/src/k_quants.h
  - ext/llama_cpp/src/llama-util.h
  - ext/llama_cpp/src/llama.cpp
  - ext/llama_cpp/src/llama.h