llama_cpp 0.0.3 → 0.0.5

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
ext/llama_cpp/src/llama.h CHANGED
@@ -55,6 +55,7 @@ extern "C" {
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
+        bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
         bool embedding;  // embedding mode only
 
@@ -64,8 +65,20 @@ extern "C" {
         void * progress_callback_user_data;
     };
 
+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+    };
+
     LLAMA_API struct llama_context_params llama_context_default_params();
 
+    LLAMA_API bool llama_mmap_supported();
+    LLAMA_API bool llama_mlock_supported();
+
     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
@@ -81,7 +94,19 @@ extern "C" {
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-                   int   itype);
+      enum llama_ftype   ftype);
+
+    // Apply a LoRA adapter to a loaded model
+    // path_base_model is the path to a higher quality model to use as a base for
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    // will be applied on top of the previous one
+    // Returns 0 on success
+    LLAMA_API int llama_apply_lora_from_file(
+            struct llama_context * ctx,
+                      const char * path_lora,
+                      const char * path_base_model,
+                             int   n_threads);
 
     // Returns the KV cache that will contain the context for the
     // ongoing prediction with the model.
@@ -166,4 +191,15 @@ extern "C" {
 }
 #endif
 
+// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
+#ifdef LLAMA_API_INTERNAL
+
+#include <vector>
+#include <string>
+struct ggml_tensor;
+
+std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+
+#endif
+
+#endif // LLAMA_H
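The mmap/mlock capability checks and the LoRA loader declared above are also exposed at the Ruby level of this gem; the matching signatures appear in the sig/llama_cpp.rbs changes further down. A minimal sketch of how they could be called, assuming a model and a LoRA adapter exist at the placeholder paths and that ContextParams has a plain constructor (only the class name appears in this diff):

    require 'llama_cpp'

    # Module-level capability checks wrapping llama_mmap_supported()/llama_mlock_supported().
    puts "mmap supported:  #{LLaMACpp.mmap_supported?}"
    puts "mlock supported: #{LLaMACpp.mlock_supported?}"

    # apply_lora_from_file wraps llama_apply_lora_from_file; leaving base_model_path out
    # corresponds to passing NULL for path_base_model in the C API.
    params  = LLaMACpp::ContextParams.new # assumed constructor
    context = LLaMACpp::Context.new(model_path: 'models/7B/ggml-model-q4_0.bin', params: params)
    context.apply_lora_from_file(lora_path: 'lora-adapter.bin', n_threads: 4)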
ext/llama_cpp/src/llama_util.h ADDED
@@ -0,0 +1,396 @@
+// Internal header to be included only by llama.cpp.
+// Contains wrappers around OS interfaces.
+
+#ifndef LLAMA_UTIL_H
+#define LLAMA_UTIL_H
+
+#include <cstdio>
+#include <cstdint>
+#include <cerrno>
+#include <cstring>
+#include <cstdarg>
+#include <cstdlib>
+#include <climits>
+
+#include <string>
+#include <vector>
+
+#ifdef __has_include
+#if __has_include(<unistd.h>)
+#include <unistd.h>
+#if defined(_POSIX_MAPPED_FILES)
+#include <sys/mman.h>
+#endif
+#endif
+#endif
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <io.h>
+#include <stdio.h> // for _fseeki64
+#endif
+
+#define LLAMA_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            abort(); \
+        } \
+    } while (0)
+
+#ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
+__attribute__((format(printf, 1, 2)))
+#endif
+#endif
+static std::string format(const char * fmt, ...) {
+    va_list ap, ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    LLAMA_ASSERT(size >= 0 && size < INT_MAX);
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    LLAMA_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
+struct llama_file {
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    size_t size;
+
+    llama_file(const char * fname, const char * mode) {
+        fp = std::fopen(fname, mode);
+        if (fp == NULL) {
+            throw format("failed to open %s: %s", fname, std::strerror(errno));
+        }
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
+        return (size_t) ret;
+    }
+
+    void seek(size_t offset, int whence) {
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        LLAMA_ASSERT(ret == 0); // same
+    }
+
+    void read_raw(void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        std::size_t ret = std::fread(ptr, size, 1, fp);
+        if (ferror(fp)) {
+            throw format("read error: %s", strerror(errno));
+        }
+        if (ret != 1) {
+            throw std::string("unexpectedly reached end of file");
+        }
+    }
+
+    std::uint32_t read_u32() {
+        std::uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    std::string read_string(std::uint32_t len) {
+        std::vector<char> chars(len);
+        read_raw(chars.data(), len);
+        return std::string(chars.data(), len);
+    }
+
+    void write_raw(const void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        size_t ret = std::fwrite(ptr, size, 1, fp);
+        if (ret != 1) {
+            throw format("write error: %s", strerror(errno));
+        }
+    }
+
+    void write_u32(std::uint32_t val) {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+};
+
+#if defined(_WIN32)
+static std::string llama_format_win_err(DWORD err) {
+    LPSTR buf;
+    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
+    if (!size) {
+        return "FormatMessageA failed";
+    }
+    std::string ret(buf, size);
+    LocalFree(buf);
+    return ret;
+}
+#endif
+
+struct llama_mmap {
+    void * addr;
+    size_t size;
+
+    llama_mmap(const llama_mmap &) = delete;
+
+#ifdef _POSIX_MAPPED_FILES
+    static constexpr bool SUPPORTED = true;
+
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
+        size = file->size;
+        int fd = fileno(file->fp);
+        int flags = MAP_SHARED;
+#ifdef __linux__
+        flags |= MAP_POPULATE;
+#endif
+        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
+        if (addr == MAP_FAILED) {
+            throw format("mmap failed: %s", strerror(errno));
+        }
+
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            if (madvise(addr, file->size, MADV_WILLNEED)) {
+                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                        strerror(errno));
+            }
+        }
+    }
+
+    ~llama_mmap() {
+        munmap(addr, size);
+    }
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
+        size = file->size;
+
+        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
+
+        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
+        DWORD error = GetLastError();
+        CloseHandle(hFile);
+
+        if (hMapping == NULL) {
+            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+        }
+
+        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+        error = GetLastError();
+        CloseHandle(hMapping);
+
+        if (addr == NULL) {
+            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+        }
+
+#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
+        }
+#else
+#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
+#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
+    }
+
+    ~llama_mmap() {
+        if (!UnmapViewOfFile(addr)) {
+            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static constexpr bool SUPPORTED = false;
+
+    llama_mmap(struct llama_file *) {
+        throw std::string("mmap not supported");
+    }
+#endif
+};
+
+// Represents some region of memory being locked using mlock or VirtualLock;
+// will automatically unlock on destruction.
+struct llama_mlock {
+    void * addr = NULL;
+    size_t size = 0;
+    bool failed_already = false;
+
+    llama_mlock() {}
+    llama_mlock(const llama_mlock &) = delete;
+
+    ~llama_mlock() {
+        if (size) {
+            raw_unlock(addr, size);
+        }
+    }
+
+    void init(void * addr) {
+        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
+        this->addr = addr;
+    }
+
+    void grow_to(size_t target_size) {
+        LLAMA_ASSERT(addr);
+        if (failed_already) {
+            return;
+        }
+        size_t granularity = lock_granularity();
+        target_size = (target_size + granularity - 1) & ~(granularity - 1);
+        if (target_size > size) {
+            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
+                size = target_size;
+            } else {
+                failed_already = true;
+            }
+        }
+    }
+
+#ifdef _POSIX_MEMLOCK_RANGE
+    static constexpr bool SUPPORTED = true;
+
+    size_t lock_granularity() {
+        return (size_t) sysconf(_SC_PAGESIZE);
+    }
+
+#ifdef __APPLE__
+#define MLOCK_SUGGESTION \
+    "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
+    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
+#else
+#define MLOCK_SUGGESTION \
+    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
+#endif
+
+    bool raw_lock(const void * addr, size_t size) {
+        if (!mlock(addr, size)) {
+            return true;
+        } else {
+            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
+                    size, this->size, std::strerror(errno));
+            return false;
+        }
+    }
+
+#undef MLOCK_SUGGESTION
+
+    void raw_unlock(void * addr, size_t size) {
+        if (munlock(addr, size)) {
+            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
+        }
+    }
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+
+    size_t lock_granularity() {
+        SYSTEM_INFO si;
+        GetSystemInfo(&si);
+        return (size_t) si.dwPageSize;
+    }
+
+    bool raw_lock(void * addr, size_t size) {
+        for (int tries = 1; ; tries++) {
+            if (VirtualLock(addr, size)) {
+                return true;
+            }
+            if (tries == 2) {
+                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
+                        size, this->size, llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+
+            // It failed but this was only the first try; increase the working
+            // set size and try again.
+            SIZE_T min_ws_size, max_ws_size;
+            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
+                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+            // Per MSDN: "The maximum number of pages that a process can lock
+            // is equal to the number of pages in its minimum working set minus
+            // a small overhead."
+            // Hopefully a megabyte is enough overhead:
+            size_t increment = size + 1048576;
+            // The minimum must be <= the maximum, so we need to increase both:
+            min_ws_size += increment;
+            max_ws_size += increment;
+            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
+                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+        }
+    }
+
+    void raw_unlock(void * addr, size_t size) {
+        if (!VirtualUnlock(addr, size)) {
+            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static constexpr bool SUPPORTED = false;
+
+    void raw_lock(const void * addr, size_t size) {
+        fprintf(stderr, "warning: mlock not supported on this system\n");
+    }
+
+    void raw_unlock(const void * addr, size_t size) {}
+#endif
+};
+
+// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
+struct llama_buffer {
+    uint8_t * addr = NULL;
+    size_t size = 0;
+
+    void resize(size_t size) {
+        delete[] addr;
+        addr = new uint8_t[size];
+        this->size = size;
+    }
+
+    ~llama_buffer() {
+        delete[] addr;
+    }
+};
+#endif
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.3'
+  VERSION = '0.0.5'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-698f7b5'
+  LLAMA_CPP_VERSION = 'master-315a95a'
 end
data/lib/llama_cpp.rb CHANGED
@@ -17,9 +17,9 @@ module LLaMACpp
   # @param n_threads [Integer]
   # @return [String]
   def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
-    prompt.insert(0, ' ')
+    spaced_prompt = " #{prompt}"
 
-    embd_input = context.tokenize(text: prompt, add_bos: true)
+    embd_input = context.tokenize(text: spaced_prompt, add_bos: true)
 
     n_ctx = context.n_ctx
     last_n_tokens = [0] * n_ctx
@@ -71,6 +71,6 @@ module LLaMACpp
       break if embd[-1] == LLaMACpp.token_eos
     end
 
-    output.join.delete_prefix(prompt).strip
+    output.join.delete_prefix(spaced_prompt).strip
   end
 end
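With this change, LLaMACpp.generate prepends the leading space to a copy of the prompt instead of mutating the caller's string, and strips that copy, rather than the original argument, from the generated output. A short usage sketch, with the model path as a placeholder and ContextParams.new assumed as above:

    require 'llama_cpp'

    params  = LLaMACpp::ContextParams.new # assumed constructor
    context = LLaMACpp::Context.new(model_path: 'models/7B/ggml-model-q4_0.bin', params: params)

    prompt = 'Hello, my name is'
    puts LLaMACpp.generate(context, prompt, n_threads: 4)
    # prompt itself is left untouched; the space lives only in the internal spaced_prompt copy.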
data/sig/llama_cpp.rbs CHANGED
@@ -9,13 +9,18 @@ module LLaMACpp
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
+  def self?.mmap_supported?: () -> bool
+  def self?.mlock_supported?: () -> bool
 
   class Context
     public
 
     def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
+                  | () -> void
     def embeddings: () -> Array[Float]
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> Qnil
+    def free: () -> void
+    def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def logits: () -> Array[Float]
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
@@ -25,6 +30,7 @@ module LLaMACpp
     def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
     def token_to_str: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+    def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
   end
 
   class ContextParams
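The added signatures describe an explicit lifecycle for Context: it can now be constructed without arguments, loaded with a model later, and released with free. A hedged sketch based only on the signatures above (paths are placeholders, and the exact runtime behaviour of the new methods is an assumption):

    require 'llama_cpp'

    context = LLaMACpp::Context.new # new zero-argument form
    context.load(model_path: 'models/7B/ggml-model-q4_0.bin',
                 params: LLaMACpp::ContextParams.new) # assumed constructor
    tokens = context.tokenize(text: ' Hello', add_bos: true)
    context.eval(tokens: tokens, n_past: 0, n_threads: 4)
    context.free # explicitly releases the underlying llama_context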
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.5
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-08 00:00:00.000000000 Z
+date: 2023-04-20 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -30,6 +30,7 @@ files:
 - ext/llama_cpp/src/ggml.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h
+- ext/llama_cpp/src/llama_util.h
 - lib/llama_cpp.rb
 - lib/llama_cpp/version.rb
 - sig/llama_cpp.rbs