llama_cpp 0.1.4 → 0.2.1

This diff compares the contents of the two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
@@ -1,6 +1,13 @@
  #ifndef LLAMA_H
  #define LLAMA_H

+ #include "ggml.h"
+ #ifdef GGML_USE_CUBLAS
+ #include "ggml-cuda.h"
+ #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+ #else
+ #define LLAMA_MAX_DEVICES 1
+ #endif // GGML_USE_CUBLAS
  #include <stddef.h>
  #include <stdint.h>
  #include <stdbool.h>
@@ -31,7 +38,7 @@
  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
  #define LLAMA_SESSION_VERSION 1

- #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
  #define LLAMA_SUPPORTS_GPU_OFFLOAD
  #endif
@@ -65,9 +72,13 @@ extern "C" {
  typedef void (*llama_progress_callback)(float progress, void *ctx);

  struct llama_context_params {
- int n_ctx; // text context
- int n_gpu_layers; // number of layers to store in VRAM
- int seed; // RNG seed, -1 for random
+ int n_ctx; // text context
+ int n_batch; // prompt processing batch size
+ int n_gpu_layers; // number of layers to store in VRAM
+ int main_gpu; // the GPU that is used for scratch and small tensors
+ float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+ bool low_vram; // if true, reduce VRAM usage at the cost of performance
+ int seed; // RNG seed, -1 for random

  bool f16_kv; // use fp16 for KV cache
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
@@ -94,9 +105,27 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
  };

+ // model quantization parameters
+ typedef struct llama_model_quantize_params {
+ int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+ enum llama_ftype ftype; // quantize to this llama_ftype
+ bool allow_requantize; // allow quantizing non-f32/f16 tensors
+ bool quantize_output_tensor; // quantize output.weight
+ } llama_model_quantize_params;
+
  LLAMA_API struct llama_context_params llama_context_default_params();
+ LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();

  LLAMA_API bool llama_mmap_supported();
  LLAMA_API bool llama_mlock_supported();
@@ -118,14 +147,11 @@ extern "C" {
  // Frees all allocated memory
  LLAMA_API void llama_free(struct llama_context * ctx);

- // TODO: not great API - very likely to change
  // Returns 0 on success
- // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
  LLAMA_API int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
- enum llama_ftype ftype,
- int nthread);
+ const llama_model_quantize_params * params);

  // Apply a LoRA adapter to a loaded model
  // path_base_model is the path to a higher quality model to use as a base for
@@ -173,6 +199,12 @@ extern "C" {
  int n_past,
  int n_threads);

+ // Export a static computation graph for context of 511 and batch size of 1
+ // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
+ // parameters here to keep things simple
+ // IMPORTANT: do not use for anything else other than debugging and testing!
+ LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+
  // Convert the provided text into tokens.
  // The tokens pointer must be large enough to hold the resulting tokens.
  // Returns the number of tokens on success, no more than n_max_tokens
@@ -189,6 +221,14 @@ extern "C" {
  LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
  LLAMA_API int llama_n_embd (const struct llama_context * ctx);

+ // Get the vocabulary as output parameters.
+ // Returns number of results.
+ LLAMA_API int llama_get_vocab(
+ const struct llama_context * ctx,
+ const char * * strings,
+ float * scores,
+ int capacity);
+
  // Token logits obtained from the last call to llama_eval()
  // The logits for the last token are stored in the last row
  // Can be mutated in order to change the probabilities of the next token
@@ -204,9 +244,9 @@ extern "C" {
  LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);

  // Special tokens
- LLAMA_API llama_token llama_token_bos();
- LLAMA_API llama_token llama_token_eos();
- LLAMA_API llama_token llama_token_nl();
+ LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
+ LLAMA_API llama_token llama_token_eos(); // end-of-sentence
+ LLAMA_API llama_token llama_token_nl(); // next-line

  // Sampling functions

@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.1.4'
+ VERSION = '0.2.1'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-ffb06a3'
+ LLAMA_CPP_VERSION = 'master-a09f919'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -5,6 +5,8 @@ module LLaMACpp
  LLAMA_FILE_MAGIC: String
  LLAMA_FILE_MAGIC_UNVERSIONED: String

+ LLAMA_MAX_DEVICES: Integer
+
  LLAMA_FTYPE_ALL_F32: Integer
  LLAMA_FTYPE_MOSTLY_F16: Integer
  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -13,9 +15,18 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q8_0: Integer
  LLAMA_FTYPE_MOSTLY_Q5_0: Integer
  LLAMA_FTYPE_MOSTLY_Q5_1: Integer
+ LLAMA_FTYPE_MOSTLY_Q2_K: Integer
+ LLAMA_FTYPE_MOSTLY_Q3_K_S: Integer
+ LLAMA_FTYPE_MOSTLY_Q3_K_M: Integer
+ LLAMA_FTYPE_MOSTLY_Q3_K_L: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_K_S: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_K_M: Integer
+ LLAMA_FTYPE_MOSTLY_Q5_K_S: Integer
+ LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
+ LLAMA_FTYPE_MOSTLY_Q6_K: Integer

  def self?.init_backend: () -> void
- def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
+ def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
  def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
  def self?.print_system_info: () -> void
  def self?.token_bos: () -> Integer
@@ -52,12 +63,14 @@ module LLaMACpp
  def embeddings: () -> Array[Float]
  def empty?: () -> bool
  def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+ def eval_export: (String) -> bool
  def free: () -> void
  def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
  def logits: () -> Array[Float]
  def n_ctx: () -> Integer
  def n_embd: () -> Integer
  def n_vocab: () -> Integer
+ def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
  def print_timings: () -> void
  def reset_timings: () -> void
  def token_to_str: (Integer) -> String
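The new eval_export and vocab methods on Context mirror llama_eval_export and llama_get_vocab from llama.h above. A minimal sketch of how they might be called; the Context.new(model_path:, params:) constructor and the file names are assumptions for illustration, not part of this diff:

    require 'llama_cpp'

    # Constructor signature and model path are illustrative assumptions.
    context = LLaMACpp::Context.new(model_path: 'model-q4_0.bin',
                                    params: LLaMACpp::ContextParams.new)

    # vocab returns a pair of arrays: token strings and their scores.
    strings, scores = context.vocab(capacity: context.n_vocab)
    puts "vocab size: #{strings.size}"

    # Dump the static computation graph (debugging/testing only, per the llama.h note).
    context.eval_export('llama.ggml')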
@@ -92,6 +105,15 @@ module LLaMACpp
  def logits_all=: (bool) -> bool
  def n_ctx: () -> Integer
  def n_ctx=: (Integer) -> Integer
+ def n_batch: () -> Integer
+ def n_batch=: (Integer) -> Integer
+ def n_gpu_layers: () -> Integer
+ def n_gpu_layers=: (Integer) -> Integer
+ def main_gpu: () -> Integer
+ def main_gpu=: (Integer) -> Integer
+ def tensor_split: () -> Array[Float]
+ def low_vram: () -> bool
+ def low_vram=: (bool) -> bool
  def seed: () -> Integer
  def seed=: (Integer) -> Integer
  def use_mlock: () -> bool
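The ContextParams accessors added here expose the new llama_context_params fields. A minimal sketch of configuring them with illustrative values; ContextParams.new taking no arguments is an assumption not shown in this diff:

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    params.n_ctx = 512        # text context
    params.n_batch = 512      # prompt processing batch size
    params.n_gpu_layers = 32  # number of layers to store in VRAM
    params.main_gpu = 0       # GPU used for scratch and small tensors
    params.low_vram = false   # if true, reduce VRAM usage at the cost of performance
    params.seed = 42
    p params.tensor_split     # per-device layer split across LLAMA_MAX_DEVICES (reader only)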
@@ -102,6 +124,19 @@ module LLaMACpp
  def vocab_only=: (bool) -> bool
  end

+ class ModelQuantizeParams
+ public
+
+ def n_thread: () -> Integer
+ def n_thread=: (Integer) -> Integer
+ def ftype: () -> Integer
+ def ftype=: (Integer) -> Integer
+ def allow_quantization: () -> bool
+ def allow_quantization=: (bool) -> bool
+ def quantize_output_tensor: () -> bool
+ def quantize_output_tensor=: (bool) -> bool
+ end
+
  class Params = ContextParams

  class Client
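With ModelQuantizeParams in place, the module-level model_quantize now takes a params object instead of ftype/n_threads keywords. A minimal sketch combining it with one of the new k-quant file types; the file paths are placeholders, and ModelQuantizeParams.new taking no arguments is an assumption:

    require 'llama_cpp'

    params = LLaMACpp::ModelQuantizeParams.new
    params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M  # one of the new k-quant types
    params.n_thread = 4                                 # <= 0 falls back to hardware concurrency
    params.quantize_output_tensor = true

    # Placeholder paths for the source and quantized model files.
    LLaMACpp.model_quantize(input_path: 'ggml-model-f16.bin',
                            output_path: 'ggml-model-q4_k_m.bin',
                            params: params)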
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.1.4
+ version: 0.2.1
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-06-03 00:00:00.000000000 Z
+ date: 2023-06-17 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -22,15 +22,23 @@ files:
  - CODE_OF_CONDUCT.md
  - LICENSE.txt
  - README.md
+ - examples/README.md
+ - examples/chat.rb
  - ext/llama_cpp/extconf.rb
  - ext/llama_cpp/llama_cpp.cpp
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
+ - ext/llama_cpp/src/ggml-cuda.cu
  - ext/llama_cpp/src/ggml-cuda.h
+ - ext/llama_cpp/src/ggml-metal.h
+ - ext/llama_cpp/src/ggml-metal.m
+ - ext/llama_cpp/src/ggml-metal.metal
  - ext/llama_cpp/src/ggml-opencl.cpp
  - ext/llama_cpp/src/ggml-opencl.h
  - ext/llama_cpp/src/ggml.c
  - ext/llama_cpp/src/ggml.h
+ - ext/llama_cpp/src/k_quants.c
+ - ext/llama_cpp/src/k_quants.h
  - ext/llama_cpp/src/llama-util.h
  - ext/llama_cpp/src/llama.cpp
  - ext/llama_cpp/src/llama.h