llama_cpp 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +3 -2
- data/ext/llama_cpp/extconf.rb +26 -0
- data/ext/llama_cpp/llama_cpp.cpp +106 -0
- data/ext/llama_cpp/src/ggml-cuda.h +12 -0
- data/ext/llama_cpp/src/ggml.c +2038 -895
- data/ext/llama_cpp/src/ggml.h +21 -1
- data/ext/llama_cpp/src/llama.cpp +376 -62
- data/ext/llama_cpp/src/llama.h +17 -1
- data/ext/llama_cpp/src/llama_util.h +22 -16
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +3 -3
- data/sig/llama_cpp.rbs +13 -1
- metadata +3 -2
data/ext/llama_cpp/src/llama.h
CHANGED
```diff
@@ -72,6 +72,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
     };

     LLAMA_API struct llama_context_params llama_context_default_params();
@@ -91,10 +93,24 @@ extern "C" {

     // TODO: not great API - very likely to change
     // Returns 0 on success
+    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            enum llama_ftype ftype);
+            enum llama_ftype ftype,
+            int nthread);
+
+    // Apply a LoRA adapter to a loaded model
+    // path_base_model is the path to a higher quality model to use as a base for
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    // will be applied on top of the previous one
+    // Returns 0 on success
+    LLAMA_API int llama_apply_lora_from_file(
+            struct llama_context * ctx,
+            const char * path_lora,
+            const char * path_base_model,
+            int n_threads);

     // Returns the KV cache that will contain the context for the
     // ongoing prediction with the model.
```
data/ext/llama_cpp/src/llama_util.h
CHANGED
```diff
@@ -43,8 +43,12 @@
 } while (0)

 #ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
 __attribute__((format(printf, 1, 2)))
 #endif
+#endif
 static std::string format(const char * fmt, ...) {
     va_list ap, ap2;
     va_start(ap, fmt);
@@ -57,7 +61,7 @@ static std::string format(const char * fmt, ...) {
     va_end(ap2);
     va_end(ap);
     return std::string(buf.data(), size);
-}
+}

 struct llama_file {
     // use FILE * so we don't have to re-open the file to mmap
@@ -164,7 +168,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -172,15 +176,16 @@ struct llama_mmap {
         flags |= MAP_POPULATE;
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        close(fd);
         if (addr == MAP_FAILED) {
             throw format("mmap failed: %s", strerror(errno));
         }

-        // Advise the kernel to preload the mapped memory
-        if (madvise(addr, file->size, MADV_WILLNEED)) {
-            fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
-                    strerror(errno));
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            if (madvise(addr, file->size, MADV_WILLNEED)) {
+                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                        strerror(errno));
+            }
         }
     }
@@ -190,14 +195,13 @@ struct llama_mmap {
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
         size = file->size;

         HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));

         HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
         DWORD error = GetLastError();
-        CloseHandle(hFile);

         if (hMapping == NULL) {
             throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
@@ -212,13 +216,15 @@ struct llama_mmap {
         }

 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-        // Advise the kernel to preload the mapped memory
-        WIN32_MEMORY_RANGE_ENTRY range;
-        range.VirtualAddress = addr;
-        range.NumberOfBytes = (SIZE_T)size;
-        if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-            fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
```
data/lib/llama_cpp/version.rb
CHANGED
```diff
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.4'
+  VERSION = '0.0.6'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-12b5900'
 end
```
data/lib/llama_cpp.rb
CHANGED
```diff
@@ -17,9 +17,9 @@ module LLaMACpp
   # @param n_threads [Integer]
   # @return [String]
   def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
-
+    spaced_prompt = " #{prompt}"

-    embd_input = context.tokenize(text:
+    embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

     n_ctx = context.n_ctx
     last_n_tokens = [0] * n_ctx
@@ -71,6 +71,6 @@ module LLaMACpp
       break if embd[-1] == LLaMACpp.token_eos
     end

-    output.join.delete_prefix(
+    output.join.delete_prefix(spaced_prompt).strip
   end
 end
```
data/sig/llama_cpp.rbs
CHANGED
```diff
@@ -5,10 +5,21 @@ module LLaMACpp
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String

+  LLAMA_FTYPE_ALL_F32: Integer
+  LLAMA_FTYPE_MOSTLY_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_2: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_3: Integer
+
+  def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
   def self?.generate: (::LLaMACpp::Context, String, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
+  def self?.mmap_supported?: () -> bool
+  def self?.mlock_supported?: () -> bool

   class Context
     public
@@ -16,7 +27,7 @@ module LLaMACpp
     def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
                   | () -> void
     def embeddings: () -> Array[Float]
-    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) ->
+    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def logits: () -> Array[Float]
@@ -28,6 +39,7 @@ module LLaMACpp
     def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
     def token_to_str: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+    def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
   end

   class ContextParams
```
metadata
CHANGED
```diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.6
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-
+date: 2023-04-22 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -26,6 +26,7 @@ files:
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
+- ext/llama_cpp/src/ggml-cuda.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
 - ext/llama_cpp/src/llama.cpp
```