llama_cpp 0.1.4 → 0.2.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/examples/README.md +60 -0
- data/examples/chat.rb +195 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +262 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +2483 -0
- data/ext/llama_cpp/src/ggml-cuda.h +18 -2
- data/ext/llama_cpp/src/ggml-metal.h +64 -0
- data/ext/llama_cpp/src/ggml-metal.m +834 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1436 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +207 -40
- data/ext/llama_cpp/src/ggml-opencl.h +4 -1
- data/ext/llama_cpp/src/ggml.c +2236 -404
- data/ext/llama_cpp/src/ggml.h +170 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +631 -179
- data/ext/llama_cpp/src/llama.h +51 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +36 -1
- metadata +10 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -1,6 +1,13 @@
 #ifndef LLAMA_H
 #define LLAMA_H
 
+#include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#else
+#define LLAMA_MAX_DEVICES 1
+#endif // GGML_USE_CUBLAS
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
@@ -31,7 +38,7 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
@@ -65,9 +72,13 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        int n_ctx;
-        int
-        int
+        int n_ctx;                             // text context
+        int n_batch;                           // prompt processing batch size
+        int n_gpu_layers;                      // number of layers to store in VRAM
+        int main_gpu;                          // the GPU that is used for scratch and small tensors
+        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+        bool low_vram;                         // if true, reduce VRAM usage at the cost of performance
+        int seed;                              // RNG seed, -1 for random
 
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
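On the Ruby side these new fields surface as ContextParams accessors (added to sig/llama_cpp.rbs later in this diff). A minimal sketch of enabling GPU offload from llama_cpp.rb 0.2.1 — the model path and layer count are placeholders, and the no-argument ContextParams.new plus Context#load flow is assumed from the load/empty? signatures shown below:

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    params.n_ctx        = 2048   # text context
    params.n_batch      = 512    # prompt processing batch size
    params.n_gpu_layers = 32     # layers to keep in VRAM (placeholder value)
    params.main_gpu     = 0      # GPU used for scratch and small tensors
    params.low_vram     = false  # set true to trade speed for lower VRAM usage
    params.seed         = 42     # RNG seed, -1 for random

    context = LLaMACpp::Context.new
    context.load(model_path: '/path/to/model.bin', params: params)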
@@ -94,9 +105,27 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7,    // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0 = 8,    // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_1 = 9,    // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K = 10,   // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K = 18,   // except 1d tensors
     };
 
+    // model quantization parameters
+    typedef struct llama_model_quantize_params {
+        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;      // quantize to this llama_ftype
+        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor; // quantize output.weight
+    } llama_model_quantize_params;
+
     LLAMA_API struct llama_context_params llama_context_default_params();
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
     LLAMA_API bool llama_mmap_supported();
     LLAMA_API bool llama_mlock_supported();
@@ -118,14 +147,11 @@ extern "C" {
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
-    // TODO: not great API - very likely to change
     // Returns 0 on success
-    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-
-            int nthread);
+            const llama_model_quantize_params * params);
 
     // Apply a LoRA adapter to a loaded model
     // path_base_model is the path to a higher quality model to use as a base for
@@ -173,6 +199,12 @@ extern "C" {
             int n_past,
             int n_threads);
 
+    // Export a static computation graph for context of 511 and batch size of 1
+    // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
+    //       parameters here to keep things simple
+    // IMPORTANT: do not use for anything else other than debugging and testing!
+    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
     // Returns the number of tokens on success, no more than n_max_tokens
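The Ruby binding exposes this as Context#eval_export (see the rbs change further down in this diff). A small sketch, for debugging only — the output filename is arbitrary:

    # Dump the static computation graph (context 511, batch size 1) to a file.
    ok = context.eval_export('llama.ggml')
    warn 'graph export failed' unless ok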
@@ -189,6 +221,14 @@ extern "C" {
     LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd(const struct llama_context * ctx);
 
+    // Get the vocabulary as output parameters.
+    // Returns number of results.
+    LLAMA_API int llama_get_vocab(
+            const struct llama_context * ctx,
+                          const char * * strings,
+                                 float * scores,
+                                     int   capacity);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
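From Ruby this is wrapped as Context#vocab, which returns the token strings and their scores as a pair of arrays (signature added in the rbs below). A sketch, assuming a loaded context:

    strings, scores = context.vocab(capacity: context.n_vocab)
    puts "vocab entries returned: #{strings.size}"
    puts "first entry: #{strings.first.inspect} (score #{scores.first})"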
@@ -204,9 +244,9 @@ extern "C" {
     LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
 
     // Special tokens
-    LLAMA_API llama_token llama_token_bos();
-    LLAMA_API llama_token llama_token_eos();
-    LLAMA_API llama_token llama_token_nl();
+    LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl();  // next-line
 
     // Sampling functions
 
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.1.4'
+  VERSION = '0.2.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-a09f919'
 end
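For reference, the two constants changed here can be checked at runtime to confirm which gem release and bundled llama.cpp revision are in use:

    require 'llama_cpp'

    puts "llama_cpp.rb #{LLaMACpp::VERSION} (llama.cpp #{LLaMACpp::LLAMA_CPP_VERSION})"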
data/sig/llama_cpp.rbs
CHANGED
@@ -5,6 +5,8 @@ module LLaMACpp
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String
 
+  LLAMA_MAX_DEVICES: Integer
+
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -13,9 +15,18 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q8_0: Integer
   LLAMA_FTYPE_MOSTLY_Q5_0: Integer
   LLAMA_FTYPE_MOSTLY_Q5_1: Integer
+  LLAMA_FTYPE_MOSTLY_Q2_K: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_M: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_L: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_K_M: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
+  LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
   def self?.init_backend: () -> void
-  def self?.model_quantize: (input_path: String, output_path: String,
+  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
@@ -52,12 +63,14 @@ module LLaMACpp
     def embeddings: () -> Array[Float]
     def empty?: () -> bool
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+    def eval_export: (String) -> bool
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def logits: () -> Array[Float]
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
     def n_vocab: () -> Integer
+    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
     def print_timings: () -> void
     def reset_timings: () -> void
     def token_to_str: (Integer) -> String
@@ -92,6 +105,15 @@ module LLaMACpp
     def logits_all=: (bool) -> bool
     def n_ctx: () -> Integer
     def n_ctx=: (Integer) -> Integer
+    def n_batch: () -> Integer
+    def n_batch=: (Integer) -> Integer
+    def n_gpu_layers: () -> Integer
+    def n_gpu_layers=: (Integer) -> Integer
+    def main_gpu: () -> Integer
+    def main_gpu=: (Integer) -> Integer
+    def tensor_split: () -> Array[Float]
+    def low_vram: () -> bool
+    def low_vram=: (bool) -> bool
     def seed: () -> Integer
     def seed=: (Integer) -> Integer
     def use_mlock: () -> bool
@@ -102,6 +124,19 @@ module LLaMACpp
     def vocab_only=: (bool) -> bool
   end
 
+  class ModelQuantizeParams
+    public
+
+    def n_thread: () -> Integer
+    def n_thread=: (Integer) -> Integer
+    def ftype: () -> Integer
+    def ftype=: (Integer) -> Integer
+    def allow_quantization: () -> bool
+    def allow_quantization=: (bool) -> bool
+    def quantize_output_tensor: () -> bool
+    def quantize_output_tensor=: (bool) -> bool
+  end
+
   class Params = ContextParams
 
   class Client
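Putting the new quantization API together: module-level model_quantize now takes a ModelQuantizeParams, and the k-quant file types added above can be passed as the target ftype. A hedged sketch — the input and output paths are placeholders, and ModelQuantizeParams.new is assumed to take no arguments and start from the C-side defaults:

    require 'llama_cpp'

    qparams = LLaMACpp::ModelQuantizeParams.new
    qparams.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M  # 4-bit k-quant, medium
    qparams.n_thread = 4                  # <= 0 falls back to hardware concurrency
    qparams.quantize_output_tensor = true # also quantize output.weight

    LLaMACpp.model_quantize(input_path: 'ggml-model-f16.bin',
                            output_path: 'ggml-model-q4_k_m.bin',
                            params: qparams)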
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.2.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-06-
+date: 2023-06-17 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -22,15 +22,23 @@ files:
 - CODE_OF_CONDUCT.md
 - LICENSE.txt
 - README.md
+- examples/README.md
+- examples/chat.rb
 - ext/llama_cpp/extconf.rb
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
+- ext/llama_cpp/src/ggml-cuda.cu
 - ext/llama_cpp/src/ggml-cuda.h
+- ext/llama_cpp/src/ggml-metal.h
+- ext/llama_cpp/src/ggml-metal.m
+- ext/llama_cpp/src/ggml-metal.metal
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
+- ext/llama_cpp/src/k_quants.c
+- ext/llama_cpp/src/k_quants.h
 - ext/llama_cpp/src/llama-util.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h