llama_cpp 0.0.4 → 0.0.6
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +3 -2
- data/ext/llama_cpp/extconf.rb +26 -0
- data/ext/llama_cpp/llama_cpp.cpp +106 -0
- data/ext/llama_cpp/src/ggml-cuda.h +12 -0
- data/ext/llama_cpp/src/ggml.c +2038 -895
- data/ext/llama_cpp/src/ggml.h +21 -1
- data/ext/llama_cpp/src/llama.cpp +376 -62
- data/ext/llama_cpp/src/llama.h +17 -1
- data/ext/llama_cpp/src/llama_util.h +22 -16
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +3 -3
- data/sig/llama_cpp.rbs +13 -1
- metadata +3 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -72,6 +72,8 @@ extern "C" {
     LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+    LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
 };

 LLAMA_API struct llama_context_params llama_context_default_params();
@@ -91,10 +93,24 @@ extern "C" {

     // TODO: not great API - very likely to change
     // Returns 0 on success
+    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            enum llama_ftype ftype);
+            enum llama_ftype ftype,
+            int nthread);
+
+    // Apply a LoRA adapter to a loaded model
+    // path_base_model is the path to a higher quality model to use as a base for
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    // will be applied on top of the previous one
+    // Returns 0 on success
+    LLAMA_API int llama_apply_lora_from_file(
+            struct llama_context * ctx,
+            const char * path_lora,
+            const char * path_base_model,
+            int n_threads);

     // Returns the KV cache that will contain the context for the
     // ongoing prediction with the model.
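Both new entry points are surfaced by the Ruby bindings in this release (see the sig/llama_cpp.rbs changes further down). A minimal sketch of applying a LoRA adapter from Ruby, assuming the keyword signature from the RBS and placeholder file paths:

    require 'llama_cpp'

    context = LLaMACpp::Context.new(model_path: 'models/7B/ggml-model-q4_0.bin',
                                    params: LLaMACpp::ContextParams.new)

    # Mirrors llama_apply_lora_from_file: base_model_path is optional and, when
    # omitted, the adapter is applied on top of the currently loaded model.
    context.apply_lora_from_file(lora_path: 'lora/ggml-adapter-model.bin', n_threads: 4)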
data/ext/llama_cpp/src/llama_util.h
CHANGED
@@ -43,8 +43,12 @@
 } while (0)

 #ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
 __attribute__((format(printf, 1, 2)))
 #endif
+#endif
 static std::string format(const char * fmt, ...) {
     va_list ap, ap2;
     va_start(ap, fmt);
@@ -57,7 +61,7 @@ static std::string format(const char * fmt, ...) {
     va_end(ap2);
     va_end(ap);
     return std::string(buf.data(), size);
-}
+}

 struct llama_file {
     // use FILE * so we don't have to re-open the file to mmap
@@ -164,7 +168,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -172,15 +176,16 @@ struct llama_mmap {
         flags |= MAP_POPULATE;
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        close(fd);
         if (addr == MAP_FAILED) {
             throw format("mmap failed: %s", strerror(errno));
         }

-        // Advise the kernel to preload the mapped memory
-        if (madvise(addr, file->size, MADV_WILLNEED)) {
-            fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
-                    strerror(errno));
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            if (madvise(addr, file->size, MADV_WILLNEED)) {
+                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                        strerror(errno));
+            }
         }
     }

@@ -190,14 +195,13 @@ struct llama_mmap {
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
         size = file->size;

         HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));

         HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
         DWORD error = GetLastError();
-        CloseHandle(hFile);

         if (hMapping == NULL) {
             throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
@@ -212,13 +216,15 @@ struct llama_mmap {
         }

 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-        // Advise the kernel to preload the mapped memory
-        WIN32_MEMORY_RANGE_ENTRY range;
-        range.VirtualAddress = addr;
-        range.NumberOfBytes = (SIZE_T)size;
-        if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-            fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.4'
+  VERSION = '0.0.6'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-12b5900'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -17,9 +17,9 @@ module LLaMACpp
   # @param n_threads [Integer]
   # @return [String]
   def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
-
+    spaced_prompt = " #{prompt}"

-    embd_input = context.tokenize(text:
+    embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

     n_ctx = context.n_ctx
     last_n_tokens = [0] * n_ctx
@@ -71,6 +71,6 @@ module LLaMACpp
       break if embd[-1] == LLaMACpp.token_eos
     end

-    output.join.delete_prefix(
+    output.join.delete_prefix(spaced_prompt).strip
   end
 end
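With this change the leading space is handled inside the helper (spaced_prompt) and stripped from the result, so callers pass the prompt unchanged. A usage sketch, assuming a placeholder model path:

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    context = LLaMACpp::Context.new(model_path: 'models/7B/ggml-model-q4_0.bin', params: params)

    # The space prefix and its removal are internal; the returned text is the
    # generated continuation of the raw prompt.
    puts LLaMACpp.generate(context, 'Hello, my name is', n_threads: 4)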
data/sig/llama_cpp.rbs
CHANGED
@@ -5,10 +5,21 @@ module LLaMACpp
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String

+  LLAMA_FTYPE_ALL_F32: Integer
+  LLAMA_FTYPE_MOSTLY_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_2: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_3: Integer
+
+  def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
   def self?.generate: (::LLaMACpp::Context, String, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
+  def self?.mmap_supported?: () -> bool
+  def self?.mlock_supported?: () -> bool

   class Context
     public
@@ -16,7 +27,7 @@ module LLaMACpp
     def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
                   | () -> void
     def embeddings: () -> Array[Float]
-    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) ->
+    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def logits: () -> Array[Float]
@@ -28,6 +39,7 @@ module LLaMACpp
     def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
     def token_to_str: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+    def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
   end

   class ContextParams
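The new constants and module-level helpers above can be combined to quantize a model to the Q4_2 format added in this release. A rough sketch, assuming placeholder paths; per the llama.h comment, passing n_threads <= 0 would fall back to the hardware concurrency:

    require 'llama_cpp'

    # Feature checks exposed in this release.
    puts "mmap supported: #{LLaMACpp.mmap_supported?}"
    puts "mlock supported: #{LLaMACpp.mlock_supported?}"

    # Quantize an F16 model to the new Q4_2 format using 4 threads.
    LLaMACpp.model_quantize(input_path: 'models/7B/ggml-model-f16.bin',
                            output_path: 'models/7B/ggml-model-q4_2.bin',
                            ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_2,
                            n_threads: 4)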
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.6
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-
+date: 2023-04-22 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -26,6 +26,7 @@ files:
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
+- ext/llama_cpp/src/ggml-cuda.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
 - ext/llama_cpp/src/llama.cpp