llama_cpp 0.1.4 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/examples/README.md +60 -0
- data/examples/chat.rb +195 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +262 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +2483 -0
- data/ext/llama_cpp/src/ggml-cuda.h +18 -2
- data/ext/llama_cpp/src/ggml-metal.h +64 -0
- data/ext/llama_cpp/src/ggml-metal.m +834 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1436 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +207 -40
- data/ext/llama_cpp/src/ggml-opencl.h +4 -1
- data/ext/llama_cpp/src/ggml.c +2236 -404
- data/ext/llama_cpp/src/ggml.h +170 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +631 -179
- data/ext/llama_cpp/src/llama.h +51 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +36 -1
- metadata +10 -2
data/ext/llama_cpp/src/llama.h
CHANGED
```diff
@@ -1,6 +1,13 @@
 #ifndef LLAMA_H
 #define LLAMA_H
 
+#include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#else
+#define LLAMA_MAX_DEVICES 1
+#endif // GGML_USE_CUBLAS
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
@@ -31,7 +38,7 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
@@ -65,9 +72,13 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        int n_ctx;
-        int n_gpu_layers;
-        int seed;
+        int n_ctx;                             // text context
+        int n_batch;                           // prompt processing batch size
+        int n_gpu_layers;                      // number of layers to store in VRAM
+        int main_gpu;                          // the GPU that is used for scratch and small tensors
+        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+        bool low_vram;                         // if true, reduce VRAM usage at the cost of performance
+        int seed;                              // RNG seed, -1 for random
 
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
@@ -94,9 +105,27 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7,    // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0 = 8,    // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_1 = 9,    // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K = 10,   // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K = 18,   // except 1d tensors
     };
 
+    // model quantization parameters
+    typedef struct llama_model_quantize_params {
+        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;      // quantize to this llama_ftype
+        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor; // quantize output.weight
+    } llama_model_quantize_params;
+
     LLAMA_API struct llama_context_params llama_context_default_params();
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
     LLAMA_API bool llama_mmap_supported();
     LLAMA_API bool llama_mlock_supported();
@@ -118,14 +147,11 @@ extern "C" {
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
-    // TODO: not great API - very likely to change
     // Returns 0 on success
-    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            enum llama_ftype ftype,
-            int nthread);
+            const llama_model_quantize_params * params);
 
     // Apply a LoRA adapter to a loaded model
     // path_base_model is the path to a higher quality model to use as a base for
@@ -173,6 +199,12 @@ extern "C" {
             int n_past,
             int n_threads);
 
+    // Export a static computation graph for context of 511 and batch size of 1
+    // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
+    //       parameters here to keep things simple
+    // IMPORTANT: do not use for anything else other than debugging and testing!
+    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
     // Returns the number of tokens on success, no more than n_max_tokens
@@ -189,6 +221,14 @@ extern "C" {
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
+    // Get the vocabulary as output parameters.
+    // Returns number of results.
+    LLAMA_API int llama_get_vocab(
+            const struct llama_context * ctx,
+            const char * * strings,
+                    float * scores,
+                      int   capacity);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
@@ -204,9 +244,9 @@ extern "C" {
     LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
 
     // Special tokens
-    LLAMA_API llama_token llama_token_bos();
-    LLAMA_API llama_token llama_token_eos();
-    LLAMA_API llama_token llama_token_nl();
+    LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl();  // next-line
 
     // Sampling functions
 
```
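The headline API change in llama.h is that `llama_model_quantize` now takes a `llama_model_quantize_params` struct (with a matching `llama_model_quantize_default_params()`) instead of positional ftype/thread arguments, and the new k-quant file types become selectable. The gem surfaces this as `LLaMACpp.model_quantize` with a `ModelQuantizeParams` object (see the `sig/llama_cpp.rbs` changes further down). A minimal Ruby sketch of the new call; the file names are placeholders, and a no-argument `ModelQuantizeParams.new` returning the defaults is assumed:

```ruby
require 'llama_cpp'

# Quantize an f16 GGML model to one of the new k-quant formats.
# File names are placeholders; ModelQuantizeParams.new is assumed to return
# the defaults (the Ruby-side counterpart of llama_model_quantize_default_params()).
params = LLaMACpp::ModelQuantizeParams.new
params.n_thread = 4                                 # <= 0 falls back to hardware concurrency
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M  # new k-quant file type (= 15)
params.quantize_output_tensor = true                # also quantize output.weight

LLaMACpp.model_quantize(input_path: 'ggml-model-f16.bin',
                        output_path: 'ggml-model-q4_k_m.bin',
                        params: params)
```

The removed positional form (input file, output file, ftype, thread count) is gone from the header, so existing callers have to switch to the params object.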
data/lib/llama_cpp/version.rb
CHANGED
```diff
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.1.4'
+  VERSION = '0.2.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-a09f919'
 end
```
data/sig/llama_cpp.rbs
CHANGED
```diff
@@ -5,6 +5,8 @@ module LLaMACpp
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String
 
+  LLAMA_MAX_DEVICES: Integer
+
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -13,9 +15,18 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q8_0: Integer
   LLAMA_FTYPE_MOSTLY_Q5_0: Integer
   LLAMA_FTYPE_MOSTLY_Q5_1: Integer
+  LLAMA_FTYPE_MOSTLY_Q2_K: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_M: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_L: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_K_M: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
+  LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
   def self?.init_backend: () -> void
-  def self?.model_quantize: (input_path: String, output_path: String,
+  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
@@ -52,12 +63,14 @@ module LLaMACpp
     def embeddings: () -> Array[Float]
     def empty?: () -> bool
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+    def eval_export: (String) -> bool
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def logits: () -> Array[Float]
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
     def n_vocab: () -> Integer
+    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
     def print_timings: () -> void
     def reset_timings: () -> void
     def token_to_str: (Integer) -> String
@@ -92,6 +105,15 @@ module LLaMACpp
     def logits_all=: (bool) -> bool
     def n_ctx: () -> Integer
     def n_ctx=: (Integer) -> Integer
+    def n_batch: () -> Integer
+    def n_batch=: (Integer) -> Integer
+    def n_gpu_layers: () -> Integer
+    def n_gpu_layers=: (Integer) -> Integer
+    def main_gpu: () -> Integer
+    def main_gpu=: (Integer) -> Integer
+    def tensor_split: () -> Array[Float]
+    def low_vram: () -> bool
+    def low_vram=: (bool) -> bool
     def seed: () -> Integer
     def seed=: (Integer) -> Integer
     def use_mlock: () -> bool
@@ -102,6 +124,19 @@ module LLaMACpp
     def vocab_only=: (bool) -> bool
   end
 
+  class ModelQuantizeParams
+    public
+
+    def n_thread: () -> Integer
+    def n_thread=: (Integer) -> Integer
+    def ftype: () -> Integer
+    def ftype=: (Integer) -> Integer
+    def allow_quantization: () -> bool
+    def allow_quantization=: (bool) -> bool
+    def quantize_output_tensor: () -> bool
+    def quantize_output_tensor=: (bool) -> bool
+  end
+
   class Params = ContextParams
 
   class Client
```
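The new `ContextParams` accessors above surface llama.cpp's batching and GPU-offload knobs (`n_batch`, `n_gpu_layers`, `main_gpu`, `tensor_split`, `low_vram`) in Ruby. A short sketch under two assumptions: the extension was built with a GPU backend (cuBLAS, CLBlast, or Metal), and a context is constructed with `Context.new(model_path:, params:)` as in the gem's README; the model path is a placeholder:

```ruby
require 'llama_cpp'

# Offload part of the model to the GPU via the new ContextParams fields.
params = LLaMACpp::ContextParams.new
params.n_ctx = 512
params.n_batch = 512      # prompt-processing batch size
params.n_gpu_layers = 32  # number of layers to keep in VRAM
params.main_gpu = 0       # GPU used for scratch and small tensors
params.low_vram = false   # set true to trade speed for lower VRAM use
params.seed = 42

# Context.new(model_path:, params:) as in the gem's README is assumed here;
# the model path is a placeholder.
context = LLaMACpp::Context.new(model_path: 'ggml-model-q4_0.bin', params: params)
puts LLaMACpp.generate(context, 'Hello, World.', n_predict: 32)
```

With `n_gpu_layers` left at 0, evaluation stays entirely on the CPU, so the offload fields are opt-in.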
metadata
CHANGED
```diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.2.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-06-
+date: 2023-06-17 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -22,15 +22,23 @@ files:
 - CODE_OF_CONDUCT.md
 - LICENSE.txt
 - README.md
+- examples/README.md
+- examples/chat.rb
 - ext/llama_cpp/extconf.rb
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
+- ext/llama_cpp/src/ggml-cuda.cu
 - ext/llama_cpp/src/ggml-cuda.h
+- ext/llama_cpp/src/ggml-metal.h
+- ext/llama_cpp/src/ggml-metal.m
+- ext/llama_cpp/src/ggml-metal.metal
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
+- ext/llama_cpp/src/k_quants.c
+- ext/llama_cpp/src/k_quants.h
 - ext/llama_cpp/src/llama-util.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h
```