llama_cpp 0.0.3 → 0.0.4

ext/llama_cpp/src/llama.h CHANGED
@@ -55,6 +55,7 @@ extern "C" {
        bool f16_kv;     // use fp16 for KV cache
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
        bool vocab_only; // only load the vocabulary, no weights
+       bool use_mmap;   // use mmap if possible
        bool use_mlock;  // force system to keep model in RAM
        bool embedding;  // embedding mode only

@@ -64,8 +65,20 @@ extern "C" {
        void * progress_callback_user_data;
    };

+   // model file types
+   enum llama_ftype {
+       LLAMA_FTYPE_ALL_F32 = 0,
+       LLAMA_FTYPE_MOSTLY_F16 = 1,  // except 1d tensors
+       LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+       LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+       LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+   };
+
    LLAMA_API struct llama_context_params llama_context_default_params();

+   LLAMA_API bool llama_mmap_supported();
+   LLAMA_API bool llama_mlock_supported();
+
    // Various functions for loading a ggml llama model.
    // Allocate (almost) all memory needed for the model.
    // Return NULL on failure
@@ -81,7 +94,7 @@ extern "C" {
    LLAMA_API int llama_model_quantize(
            const char * fname_inp,
            const char * fname_out,
-           int itype);
+     enum llama_ftype ftype);

    // Returns the KV cache that will contain the context for the
    // ongoing prediction with the model.
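For context on the signature change above: llama_model_quantize now takes the new enum llama_ftype instead of a bare int. A minimal sketch of a caller, assuming hypothetical input/output paths and the Q4_0 target from the enum (illustrative, not code shipped with this gem):

#include "llama.h"

int main() {
    // Hypothetical paths; any f16 ggml model file would do.
    const char * fname_inp = "models/7B/ggml-model-f16.bin";
    const char * fname_out = "models/7B/ggml-model-q4_0.bin";
    // 0.0.4 passes an enum llama_ftype here instead of a bare int.
    int ret = llama_model_quantize(fname_inp, fname_out, LLAMA_FTYPE_MOSTLY_Q4_0);
    return ret == 0 ? 0 : 1;
}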
@@ -166,4 +179,15 @@ extern "C" {
 }
 #endif

+// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
+#ifdef LLAMA_API_INTERNAL
+
+#include <vector>
+#include <string>
+struct ggml_tensor;
+
+std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+
 #endif
+
+#endif // LLAMA_H
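Taken together, the new use_mmap flag, the existing use_mlock flag, and the two capability probes fit together roughly as follows. This is a hedged sketch only: it uses just the declarations shown above, and elides the actual model-loading call (the one documented as returning NULL on failure).

#include "llama.h"
#include <cstdio>

int main() {
    struct llama_context_params params = llama_context_default_params();
    // Only request features this build/platform actually supports.
    params.use_mmap  = llama_mmap_supported();
    params.use_mlock = llama_mlock_supported();
    std::printf("mmap supported: %d, mlock supported: %d\n",
                params.use_mmap, params.use_mlock);
    // ... pass params to the model loader described above ...
    return 0;
}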
ext/llama_cpp/src/llama_util.h ADDED
@@ -0,0 +1,389 @@
+// Internal header to be included only by llama.cpp.
+// Contains wrappers around OS interfaces.
+
+#ifndef LLAMA_UTIL_H
+#define LLAMA_UTIL_H
+
+#include <cstdio>
+#include <cstdint>
+#include <cerrno>
+#include <cstring>
+#include <cstdarg>
+#include <cstdlib>
+#include <climits>
+
+#include <string>
+#include <vector>
+
+#ifdef __has_include
+#if __has_include(<unistd.h>)
+#include <unistd.h>
+#if defined(_POSIX_MAPPED_FILES)
+#include <sys/mman.h>
+#endif
+#endif
+#endif
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <io.h>
+#include <stdio.h> // for _fseeki64
+#endif
+
+#define LLAMA_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            abort(); \
+        } \
+    } while (0)
+
+#ifdef __GNUC__
+__attribute__((format(printf, 1, 2)))
+#endif
+static std::string format(const char * fmt, ...) {
+    va_list ap, ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    LLAMA_ASSERT(size >= 0 && size < INT_MAX);
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    LLAMA_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+};
+
+struct llama_file {
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    size_t size;
+
+    llama_file(const char * fname, const char * mode) {
+        fp = std::fopen(fname, mode);
+        if (fp == NULL) {
+            throw format("failed to open %s: %s", fname, std::strerror(errno));
+        }
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
+        return (size_t) ret;
+    }
+
+    void seek(size_t offset, int whence) {
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        LLAMA_ASSERT(ret == 0); // same
+    }
+
+    void read_raw(void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        std::size_t ret = std::fread(ptr, size, 1, fp);
+        if (ferror(fp)) {
+            throw format("read error: %s", strerror(errno));
+        }
+        if (ret != 1) {
+            throw std::string("unexpectedly reached end of file");
+        }
+    }
+
+    std::uint32_t read_u32() {
+        std::uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    std::string read_string(std::uint32_t len) {
+        std::vector<char> chars(len);
+        read_raw(chars.data(), len);
+        return std::string(chars.data(), len);
+    }
+
+    void write_raw(const void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        size_t ret = std::fwrite(ptr, size, 1, fp);
+        if (ret != 1) {
+            throw format("write error: %s", strerror(errno));
+        }
+    }
+
+    void write_u32(std::uint32_t val) {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+};
+
+#if defined(_WIN32)
+static std::string llama_format_win_err(DWORD err) {
+    LPSTR buf;
+    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
+    if (!size) {
+        return "FormatMessageA failed";
+    }
+    std::string ret(buf, size);
+    LocalFree(buf);
+    return ret;
+}
+#endif
+
+struct llama_mmap {
+    void * addr;
+    size_t size;
+
+    llama_mmap(const llama_mmap &) = delete;
+
+#ifdef _POSIX_MAPPED_FILES
+    static constexpr bool SUPPORTED = true;
+
+    llama_mmap(struct llama_file * file) {
+        size = file->size;
+        int fd = fileno(file->fp);
+        int flags = MAP_SHARED;
+#ifdef __linux__
+        flags |= MAP_POPULATE;
+#endif
+        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
+        close(fd);
+        if (addr == MAP_FAILED) {
+            throw format("mmap failed: %s", strerror(errno));
+        }
+
+        // Advise the kernel to preload the mapped memory
+        if (madvise(addr, file->size, MADV_WILLNEED)) {
+            fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                    strerror(errno));
+        }
+    }
+
+    ~llama_mmap() {
+        munmap(addr, size);
+    }
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+
+    llama_mmap(struct llama_file * file) {
+        size = file->size;
+
+        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
+
+        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
+        DWORD error = GetLastError();
+        CloseHandle(hFile);
+
+        if (hMapping == NULL) {
+            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+        }
+
+        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+        error = GetLastError();
+        CloseHandle(hMapping);
+
+        if (addr == NULL) {
+            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+        }
+
+#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
+        // Advise the kernel to preload the mapped memory
+        WIN32_MEMORY_RANGE_ENTRY range;
+        range.VirtualAddress = addr;
+        range.NumberOfBytes = (SIZE_T)size;
+        if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+            fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+#else
+#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
+#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
+    }
+
+    ~llama_mmap() {
+        if (!UnmapViewOfFile(addr)) {
+            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static constexpr bool SUPPORTED = false;
+
+    llama_mmap(struct llama_file *) {
+        throw std::string("mmap not supported");
+    }
+#endif
+};
+
+// Represents some region of memory being locked using mlock or VirtualLock;
+// will automatically unlock on destruction.
+struct llama_mlock {
+    void * addr = NULL;
+    size_t size = 0;
+    bool failed_already = false;
+
+    llama_mlock() {}
+    llama_mlock(const llama_mlock &) = delete;
+
+    ~llama_mlock() {
+        if (size) {
+            raw_unlock(addr, size);
+        }
+    }
+
+    void init(void * addr) {
+        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
+        this->addr = addr;
+    }
+
+    void grow_to(size_t target_size) {
+        LLAMA_ASSERT(addr);
+        if (failed_already) {
+            return;
+        }
+        size_t granularity = lock_granularity();
+        target_size = (target_size + granularity - 1) & ~(granularity - 1);
+        if (target_size > size) {
+            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
+                size = target_size;
+            } else {
+                failed_already = true;
+            }
+        }
+    }
+
+#ifdef _POSIX_MEMLOCK_RANGE
+    static constexpr bool SUPPORTED = true;
+
+    size_t lock_granularity() {
+        return (size_t) sysconf(_SC_PAGESIZE);
+    }
+
+#ifdef __APPLE__
+#define MLOCK_SUGGESTION \
+    "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
+    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
+#else
+#define MLOCK_SUGGESTION \
+    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
+#endif
+
+    bool raw_lock(const void * addr, size_t size) {
+        if (!mlock(addr, size)) {
+            return true;
+        } else {
+            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
+                    size, this->size, std::strerror(errno));
+            return false;
+        }
+    }
+
+#undef MLOCK_SUGGESTION
+
+    void raw_unlock(void * addr, size_t size) {
+        if (munlock(addr, size)) {
+            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
+        }
+    }
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+
+    size_t lock_granularity() {
+        SYSTEM_INFO si;
+        GetSystemInfo(&si);
+        return (size_t) si.dwPageSize;
+    }
+
+    bool raw_lock(void * addr, size_t size) {
+        for (int tries = 1; ; tries++) {
+            if (VirtualLock(addr, size)) {
+                return true;
+            }
+            if (tries == 2) {
+                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
+                        size, this->size, llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+
+            // It failed but this was only the first try; increase the working
+            // set size and try again.
+            SIZE_T min_ws_size, max_ws_size;
+            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
+                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+            // Per MSDN: "The maximum number of pages that a process can lock
+            // is equal to the number of pages in its minimum working set minus
+            // a small overhead."
+            // Hopefully a megabyte is enough overhead:
+            size_t increment = size + 1048576;
+            // The minimum must be <= the maximum, so we need to increase both:
+            min_ws_size += increment;
+            max_ws_size += increment;
+            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
+                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+        }
+    }
+
+    void raw_unlock(void * addr, size_t size) {
+        if (!VirtualUnlock(addr, size)) {
+            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static constexpr bool SUPPORTED = false;
+
+    void raw_lock(const void * addr, size_t size) {
+        fprintf(stderr, "warning: mlock not supported on this system\n");
+    }
+
+    void raw_unlock(const void * addr, size_t size) {}
+#endif
+};
+
+// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
+struct llama_buffer {
+    uint8_t * addr = NULL;
+    size_t size = 0;
+
+    void resize(size_t size) {
+        delete[] addr;
+        addr = new uint8_t[size];
+        this->size = size;
+    }
+
+    ~llama_buffer() {
+        delete[] addr;
+    }
+};
+#endif
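For orientation, a sketch of how llama.cpp might combine these wrappers: open the file once, mmap it when the platform supports it, optionally pin the mapping with the mlock helper, and fall back to reading into a llama_buffer otherwise. The function name and path handling are illustrative; the header is internal, so this would live inside llama.cpp itself.

#include "llama_util.h"

// Illustrative only: the wrappers throw std::string on hard failures and
// print warnings for the best-effort operations (madvise, mlock, ...).
static void example_load(const char * fname) {
    llama_file file(fname, "rb");           // opens once; size is captured here

    if (llama_mmap::SUPPORTED) {
        llama_mmap mapping(&file);          // read-only mapping of the whole file
        llama_mlock lock;
        lock.init(mapping.addr);
        lock.grow_to(mapping.size);         // best effort; warns and gives up on failure
        // ... parse tensors directly out of mapping.addr ...
    } else {
        llama_buffer buf;
        buf.resize(file.size);              // uninitialized storage, unlike std::vector<uint8_t>
        file.read_raw(buf.addr, file.size);
        // ... parse tensors out of buf.addr ...
    }
}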
lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.3'
+  VERSION = '0.0.4'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-698f7b5'
+  LLAMA_CPP_VERSION = 'master-c85e03d'
 end
data/sig/llama_cpp.rbs CHANGED
@@ -14,8 +14,11 @@ module LLaMACpp
    public

    def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
+                 | () -> void
    def embeddings: () -> Array[Float]
    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> Qnil
+   def free: () -> void
+   def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
    def logits: () -> Array[Float]
    def n_ctx: () -> Integer
    def n_embd: () -> Integer
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-08 00:00:00.000000000 Z
+date: 2023-04-15 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -30,6 +30,7 @@ files:
 - ext/llama_cpp/src/ggml.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h
+- ext/llama_cpp/src/llama_util.h
 - lib/llama_cpp.rb
 - lib/llama_cpp/version.rb
 - sig/llama_cpp.rbs