llama_cpp 0.0.3 → 0.0.5

data/ext/llama_cpp/src/llama.h CHANGED
@@ -55,6 +55,7 @@ extern "C" {
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
+        bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
         bool embedding;  // embedding mode only

@@ -64,8 +65,20 @@ extern "C" {
         void * progress_callback_user_data;
     };

+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+    };
+
     LLAMA_API struct llama_context_params llama_context_default_params();

+    LLAMA_API bool llama_mmap_supported();
+    LLAMA_API bool llama_mlock_supported();
+
     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
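
The hunk above adds mmap/mlock capability probes next to llama_context_default_params(), complementing the new use_mmap flag from the first hunk. Below is a minimal caller-side sketch, not part of the gem diff: it assumes the llama_init_from_file()/llama_free() loader declared elsewhere in llama.h and uses a made-up model path.

#include <stdio.h>
#include "llama.h"

int main(void) {
    struct llama_context_params params = llama_context_default_params();
    params.use_mmap  = llama_mmap_supported();   // map the model file instead of copying it into RAM
    params.use_mlock = llama_mlock_supported();  // additionally pin the pages if the OS allows it

    struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_free(ctx);
    return 0;
}
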
@@ -81,7 +94,19 @@ extern "C" {
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            int itype);
+            enum llama_ftype ftype);
+
+    // Apply a LoRA adapter to a loaded model
+    // path_base_model is the path to a higher quality model to use as a base for
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    // will be applied on top of the previous one
+    // Returns 0 on success
+    LLAMA_API int llama_apply_lora_from_file(
+            struct llama_context * ctx,
+            const char * path_lora,
+            const char * path_base_model,
+            int n_threads);

     // Returns the KV cache that will contain the context for the
     // ongoing prediction with the model.
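
The comments above describe the quantization and LoRA entry points; the sketch below shows one plausible call sequence. It is not part of the gem diff, the file names are placeholders, and llama_init_from_file()/llama_free() are again assumed from elsewhere in llama.h.

#include "llama.h"

int quantize_then_apply_lora(void) {
    // quantize to 4-bit using the new enum instead of a bare int
    if (llama_model_quantize("ggml-model-f16.bin", "ggml-model-q4_0.bin",
                             LLAMA_FTYPE_MOSTLY_Q4_0) != 0) {
        return 1;
    }

    struct llama_context * ctx =
        llama_init_from_file("ggml-model-q4_0.bin", llama_context_default_params());
    if (ctx == NULL) {
        return 1;
    }

    // per the header comment: apply the adapter to a freshly loaded model, and
    // optionally pass a higher-quality base model (or NULL) for the patched layers
    int rc = llama_apply_lora_from_file(ctx, "lora-adapter.bin", "ggml-model-f16.bin", 4);

    llama_free(ctx);
    return rc;
}
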
@@ -166,4 +191,15 @@ extern "C" {
 }
 #endif

+// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
+#ifdef LLAMA_API_INTERNAL
+
+#include <vector>
+#include <string>
+struct ggml_tensor;
+
+std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+
 #endif
+
+#endif // LLAMA_H
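
The LLAMA_API_INTERNAL block is aimed at tests and benchmarks only. A hypothetical use, sketched below and not part of the gem diff, is enumerating the loaded tensors; it assumes a translation unit built with LLAMA_API_INTERNAL defined and an already-initialized context.

#define LLAMA_API_INTERNAL
#include "llama.h"

#include <cstdio>

void dump_tensor_names(struct llama_context * ctx) {
    // each entry pairs a tensor name with the ggml tensor backing it
    for (const auto & entry : llama_internal_get_tensor_map(ctx)) {
        std::printf("%s\n", entry.first.c_str());
    }
}
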
data/ext/llama_cpp/src/llama_util.h ADDED
@@ -0,0 +1,396 @@
+// Internal header to be included only by llama.cpp.
+// Contains wrappers around OS interfaces.
+
+#ifndef LLAMA_UTIL_H
+#define LLAMA_UTIL_H
+
+#include <cstdio>
+#include <cstdint>
+#include <cerrno>
+#include <cstring>
+#include <cstdarg>
+#include <cstdlib>
+#include <climits>
+
+#include <string>
+#include <vector>
+
+#ifdef __has_include
+    #if __has_include(<unistd.h>)
+        #include <unistd.h>
+        #if defined(_POSIX_MAPPED_FILES)
+            #include <sys/mman.h>
+        #endif
+    #endif
+#endif
+
+#if defined(_WIN32)
+    #define WIN32_LEAN_AND_MEAN
+    #ifndef NOMINMAX
+        #define NOMINMAX
+    #endif
+    #include <windows.h>
+    #include <io.h>
+    #include <stdio.h> // for _fseeki64
+#endif
+
+#define LLAMA_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            abort(); \
+        } \
+    } while (0)
+
+#ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
+__attribute__((format(printf, 1, 2)))
+#endif
+#endif
+static std::string format(const char * fmt, ...) {
+    va_list ap, ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    LLAMA_ASSERT(size >= 0 && size < INT_MAX);
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    LLAMA_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
+struct llama_file {
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    size_t size;
+
+    llama_file(const char * fname, const char * mode) {
+        fp = std::fopen(fname, mode);
+        if (fp == NULL) {
+            throw format("failed to open %s: %s", fname, std::strerror(errno));
+        }
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
+        return (size_t) ret;
+    }
+
+    void seek(size_t offset, int whence) {
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        LLAMA_ASSERT(ret == 0); // same
+    }
+
+    void read_raw(void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        std::size_t ret = std::fread(ptr, size, 1, fp);
+        if (ferror(fp)) {
+            throw format("read error: %s", strerror(errno));
+        }
+        if (ret != 1) {
+            throw std::string("unexpectedly reached end of file");
+        }
+    }
+
+    std::uint32_t read_u32() {
+        std::uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    std::string read_string(std::uint32_t len) {
+        std::vector<char> chars(len);
+        read_raw(chars.data(), len);
+        return std::string(chars.data(), len);
+    }
+
+    void write_raw(const void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        size_t ret = std::fwrite(ptr, size, 1, fp);
+        if (ret != 1) {
+            throw format("write error: %s", strerror(errno));
+        }
+    }
+
+    void write_u32(std::uint32_t val) {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+};
+
+#if defined(_WIN32)
+static std::string llama_format_win_err(DWORD err) {
+    LPSTR buf;
+    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
+    if (!size) {
+        return "FormatMessageA failed";
+    }
+    std::string ret(buf, size);
+    LocalFree(buf);
+    return ret;
+}
+#endif
+
+struct llama_mmap {
+    void * addr;
+    size_t size;
+
+    llama_mmap(const llama_mmap &) = delete;
+
+#ifdef _POSIX_MAPPED_FILES
+    static constexpr bool SUPPORTED = true;
+
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
+        size = file->size;
+        int fd = fileno(file->fp);
+        int flags = MAP_SHARED;
+#ifdef __linux__
+        flags |= MAP_POPULATE;
+#endif
+        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
+        if (addr == MAP_FAILED) {
+            throw format("mmap failed: %s", strerror(errno));
+        }
+
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            if (madvise(addr, file->size, MADV_WILLNEED)) {
+                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                        strerror(errno));
+            }
+        }
+    }
+
+    ~llama_mmap() {
+        munmap(addr, size);
+    }
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
+        size = file->size;
+
+        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
+
+        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
+        DWORD error = GetLastError();
+        CloseHandle(hFile);
+
+        if (hMapping == NULL) {
+            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+        }
+
+        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+        error = GetLastError();
+        CloseHandle(hMapping);
+
+        if (addr == NULL) {
+            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+        }
+
+#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
+        }
+#else
+        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
+#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
+    }
+
+    ~llama_mmap() {
+        if (!UnmapViewOfFile(addr)) {
+            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static constexpr bool SUPPORTED = false;
+
+    llama_mmap(struct llama_file *) {
+        throw std::string("mmap not supported");
+    }
+#endif
+};
+
+// Represents some region of memory being locked using mlock or VirtualLock;
+// will automatically unlock on destruction.
+struct llama_mlock {
+    void * addr = NULL;
+    size_t size = 0;
+    bool failed_already = false;
+
+    llama_mlock() {}
+    llama_mlock(const llama_mlock &) = delete;
+
+    ~llama_mlock() {
+        if (size) {
+            raw_unlock(addr, size);
+        }
+    }
+
+    void init(void * addr) {
+        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
+        this->addr = addr;
+    }
+
+    void grow_to(size_t target_size) {
+        LLAMA_ASSERT(addr);
+        if (failed_already) {
+            return;
+        }
+        size_t granularity = lock_granularity();
+        target_size = (target_size + granularity - 1) & ~(granularity - 1);
+        if (target_size > size) {
+            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
+                size = target_size;
+            } else {
+                failed_already = true;
+            }
+        }
+    }
+
+#ifdef _POSIX_MEMLOCK_RANGE
+    static constexpr bool SUPPORTED = true;
+
+    size_t lock_granularity() {
+        return (size_t) sysconf(_SC_PAGESIZE);
+    }
+
+    #ifdef __APPLE__
+        #define MLOCK_SUGGESTION \
+            "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
+            "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
+    #else
+        #define MLOCK_SUGGESTION \
+            "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
+    #endif
+
+    bool raw_lock(const void * addr, size_t size) {
+        if (!mlock(addr, size)) {
+            return true;
+        } else {
+            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
+                    size, this->size, std::strerror(errno));
+            return false;
+        }
+    }
+
+    #undef MLOCK_SUGGESTION
+
+    void raw_unlock(void * addr, size_t size) {
+        if (munlock(addr, size)) {
+            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
+        }
+    }
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+
+    size_t lock_granularity() {
+        SYSTEM_INFO si;
+        GetSystemInfo(&si);
+        return (size_t) si.dwPageSize;
+    }
+
+    bool raw_lock(void * addr, size_t size) {
+        for (int tries = 1; ; tries++) {
+            if (VirtualLock(addr, size)) {
+                return true;
+            }
+            if (tries == 2) {
+                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
+                        size, this->size, llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+
+            // It failed but this was only the first try; increase the working
+            // set size and try again.
+            SIZE_T min_ws_size, max_ws_size;
+            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
+                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+            // Per MSDN: "The maximum number of pages that a process can lock
+            // is equal to the number of pages in its minimum working set minus
+            // a small overhead."
+            // Hopefully a megabyte is enough overhead:
+            size_t increment = size + 1048576;
+            // The minimum must be <= the maximum, so we need to increase both:
+            min_ws_size += increment;
+            max_ws_size += increment;
+            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
+                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+        }
+    }
+
+    void raw_unlock(void * addr, size_t size) {
+        if (!VirtualUnlock(addr, size)) {
+            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static constexpr bool SUPPORTED = false;
+
+    void raw_lock(const void * addr, size_t size) {
+        fprintf(stderr, "warning: mlock not supported on this system\n");
+    }
+
+    void raw_unlock(const void * addr, size_t size) {}
+#endif
+};
+
+// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
+struct llama_buffer {
+    uint8_t * addr = NULL;
+    size_t size = 0;
+
+    void resize(size_t size) {
+        delete[] addr;
+        addr = new uint8_t[size];
+        this->size = size;
+    }
+
+    ~llama_buffer() {
+        delete[] addr;
+    }
+};
+#endif
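
llama_util.h is internal to llama.cpp, so the wrappers above are not reachable from the gem's Ruby API. Purely as an illustration of how they compose inside the loader, here is a hedged sketch (not part of the gem diff; the file name is made up):

#include "llama_util.h"

static void map_and_lock(const char * fname) {
    llama_file file(fname, "rb");   // opens the file and records its size
    llama_mmap mapping(&file);      // read-only mapping, prefetched where supported

    llama_mlock lock;
    lock.init(mapping.addr);        // remember the base address once
    lock.grow_to(mapping.size);     // mlock/VirtualLock, rounded up to page granularity

    // mapping and lock release themselves when they go out of scope
}
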
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.3'
+  VERSION = '0.0.5'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-698f7b5'
+  LLAMA_CPP_VERSION = 'master-315a95a'
 end
data/lib/llama_cpp.rb CHANGED
@@ -17,9 +17,9 @@ module LLaMACpp
   # @param n_threads [Integer]
   # @return [String]
   def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
-    prompt.insert(0, ' ')
+    spaced_prompt = " #{prompt}"

-    embd_input = context.tokenize(text: prompt, add_bos: true)
+    embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

     n_ctx = context.n_ctx
     last_n_tokens = [0] * n_ctx
@@ -71,6 +71,6 @@ module LLaMACpp
       break if embd[-1] == LLaMACpp.token_eos
     end

-    output.join.delete_prefix(prompt).strip
+    output.join.delete_prefix(spaced_prompt).strip
   end
 end
data/sig/llama_cpp.rbs CHANGED
@@ -9,13 +9,18 @@ module LLaMACpp
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
+  def self?.mmap_supported?: () -> bool
+  def self?.mlock_supported?: () -> bool

   class Context
     public

     def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
+                   | () -> void
     def embeddings: () -> Array[Float]
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> Qnil
+    def free: () -> void
+    def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def logits: () -> Array[Float]
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
@@ -25,6 +30,7 @@ module LLaMACpp
     def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
     def token_to_str: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+    def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
   end

   class ContextParams
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.5
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-08 00:00:00.000000000 Z
+date: 2023-04-20 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -30,6 +30,7 @@ files:
 - ext/llama_cpp/src/ggml.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h
+- ext/llama_cpp/src/llama_util.h
 - lib/llama_cpp.rb
 - lib/llama_cpp/version.rb
 - sig/llama_cpp.rbs