llama_cpp 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -55,6 +55,7 @@ extern "C" {
55
55
  bool f16_kv; // use fp16 for KV cache
56
56
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
57
57
  bool vocab_only; // only load the vocabulary, no weights
58
+ bool use_mmap; // use mmap if possible
58
59
  bool use_mlock; // force system to keep model in RAM
59
60
  bool embedding; // embedding mode only
60
61
 
@@ -64,8 +65,20 @@ extern "C" {
64
65
  void * progress_callback_user_data;
65
66
  };
66
67
 
68
+ // model file types
69
+ enum llama_ftype {
70
+ LLAMA_FTYPE_ALL_F32 = 0,
71
+ LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
72
+ LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
73
+ LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
74
+ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
75
+ };
76
+
67
77
  LLAMA_API struct llama_context_params llama_context_default_params();
68
78
 
79
+ LLAMA_API bool llama_mmap_supported();
80
+ LLAMA_API bool llama_mlock_supported();
81
+
69
82
  // Various functions for loading a ggml llama model.
70
83
  // Allocate (almost) all memory needed for the model.
71
84
  // Return NULL on failure
@@ -81,7 +94,7 @@ extern "C" {
81
94
  LLAMA_API int llama_model_quantize(
82
95
  const char * fname_inp,
83
96
  const char * fname_out,
84
- int itype);
97
+ enum llama_ftype ftype);
85
98
 
86
99
  // Returns the KV cache that will contain the context for the
87
100
  // ongoing prediction with the model.
@@ -166,4 +179,15 @@ extern "C" {
166
179
  }
167
180
  #endif
168
181
 
182
+ // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
183
+ #ifdef LLAMA_API_INTERNAL
184
+
185
+ #include <vector>
186
+ #include <string>
187
+ struct ggml_tensor;
188
+
189
+ std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
190
+
169
191
  #endif
192
+
193
+ #endif // LLAMA_H
@@ -0,0 +1,389 @@
1
+ // Internal header to be included only by llama.cpp.
2
+ // Contains wrappers around OS interfaces.
3
+
4
+ #ifndef LLAMA_UTIL_H
5
+ #define LLAMA_UTIL_H
6
+
7
+ #include <cstdio>
8
+ #include <cstdint>
9
+ #include <cerrno>
10
+ #include <cstring>
11
+ #include <cstdarg>
12
+ #include <cstdlib>
13
+ #include <climits>
14
+
15
+ #include <string>
16
+ #include <vector>
17
+
18
+ #ifdef __has_include
19
+ #if __has_include(<unistd.h>)
20
+ #include <unistd.h>
21
+ #if defined(_POSIX_MAPPED_FILES)
22
+ #include <sys/mman.h>
23
+ #endif
24
+ #endif
25
+ #endif
26
+
27
+ #if defined(_WIN32)
28
+ #define WIN32_LEAN_AND_MEAN
29
+ #ifndef NOMINMAX
30
+ #define NOMINMAX
31
+ #endif
32
+ #include <windows.h>
33
+ #include <io.h>
34
+ #include <stdio.h> // for _fseeki64
35
+ #endif
36
+
37
// Always-on assertion: when `x` evaluates to false, prints the source
// location and the stringified expression to stderr, then calls abort().
// The do/while(0) wrapper makes the macro behave as a single statement.
#define LLAMA_ASSERT(x)                                                    \
    do {                                                                   \
        if (!(x)) {                                                        \
            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n",                   \
                    __FILE__, __LINE__, #x);                               \
            abort();                                                       \
        }                                                                  \
    } while (0)
44
+
45
+ #ifdef __GNUC__
46
+ __attribute__((format(printf, 1, 2)))
47
+ #endif
48
+ static std::string format(const char * fmt, ...) {
49
+ va_list ap, ap2;
50
+ va_start(ap, fmt);
51
+ va_copy(ap2, ap);
52
+ int size = vsnprintf(NULL, 0, fmt, ap);
53
+ LLAMA_ASSERT(size >= 0 && size < INT_MAX);
54
+ std::vector<char> buf(size + 1);
55
+ int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
56
+ LLAMA_ASSERT(size2 == size);
57
+ va_end(ap2);
58
+ va_end(ap);
59
+ return std::string(buf.data(), size);
60
+ };
61
+
62
+ struct llama_file {
63
+ // use FILE * so we don't have to re-open the file to mmap
64
+ FILE * fp;
65
+ size_t size;
66
+
67
+ llama_file(const char * fname, const char * mode) {
68
+ fp = std::fopen(fname, mode);
69
+ if (fp == NULL) {
70
+ throw format("failed to open %s: %s", fname, std::strerror(errno));
71
+ }
72
+ seek(0, SEEK_END);
73
+ size = tell();
74
+ seek(0, SEEK_SET);
75
+ }
76
+
77
+ size_t tell() const {
78
+ #ifdef _WIN32
79
+ __int64 ret = _ftelli64(fp);
80
+ #else
81
+ long ret = std::ftell(fp);
82
+ #endif
83
+ LLAMA_ASSERT(ret != -1); // this really shouldn't fail
84
+ return (size_t) ret;
85
+ }
86
+
87
+ void seek(size_t offset, int whence) {
88
+ #ifdef _WIN32
89
+ int ret = _fseeki64(fp, (__int64) offset, whence);
90
+ #else
91
+ int ret = std::fseek(fp, (long) offset, whence);
92
+ #endif
93
+ LLAMA_ASSERT(ret == 0); // same
94
+ }
95
+
96
+ void read_raw(void * ptr, size_t size) {
97
+ if (size == 0) {
98
+ return;
99
+ }
100
+ errno = 0;
101
+ std::size_t ret = std::fread(ptr, size, 1, fp);
102
+ if (ferror(fp)) {
103
+ throw format("read error: %s", strerror(errno));
104
+ }
105
+ if (ret != 1) {
106
+ throw std::string("unexpectedly reached end of file");
107
+ }
108
+ }
109
+
110
+ std::uint32_t read_u32() {
111
+ std::uint32_t ret;
112
+ read_raw(&ret, sizeof(ret));
113
+ return ret;
114
+ }
115
+
116
+ std::string read_string(std::uint32_t len) {
117
+ std::vector<char> chars(len);
118
+ read_raw(chars.data(), len);
119
+ return std::string(chars.data(), len);
120
+ }
121
+
122
+ void write_raw(const void * ptr, size_t size) {
123
+ if (size == 0) {
124
+ return;
125
+ }
126
+ errno = 0;
127
+ size_t ret = std::fwrite(ptr, size, 1, fp);
128
+ if (ret != 1) {
129
+ throw format("write error: %s", strerror(errno));
130
+ }
131
+ }
132
+
133
+ void write_u32(std::uint32_t val) {
134
+ write_raw(&val, sizeof(val));
135
+ }
136
+
137
+ ~llama_file() {
138
+ if (fp) {
139
+ std::fclose(fp);
140
+ }
141
+ }
142
+ };
143
+
144
#if defined(_WIN32)
// Converts a Win32 error code into the system-provided message text.
// Returns a fixed fallback string if FormatMessageA() itself fails.
static std::string llama_format_win_err(DWORD err) {
    const DWORD fm_flags = FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS;
    LPSTR msg;
    // With FORMAT_MESSAGE_ALLOCATE_BUFFER the API allocates the buffer and
    // stores its address through the (cast) pointer-to-pointer argument.
    const size_t msg_len = FormatMessageA(fm_flags, NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
                                          (LPSTR)&msg, 0, NULL);
    if (msg_len == 0) {
        return "FormatMessageA failed";
    }
    // Copy the text out before releasing the system-allocated buffer.
    std::string text(msg, msg_len);
    LocalFree(msg);
    return text;
}
#endif
157
+
158
+ struct llama_mmap {
159
+ void * addr;
160
+ size_t size;
161
+
162
+ llama_mmap(const llama_mmap &) = delete;
163
+
164
+ #ifdef _POSIX_MAPPED_FILES
165
+ static constexpr bool SUPPORTED = true;
166
+
167
+ llama_mmap(struct llama_file * file) {
168
+ size = file->size;
169
+ int fd = fileno(file->fp);
170
+ int flags = MAP_SHARED;
171
+ #ifdef __linux__
172
+ flags |= MAP_POPULATE;
173
+ #endif
174
+ addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
175
+ close(fd);
176
+ if (addr == MAP_FAILED) {
177
+ throw format("mmap failed: %s", strerror(errno));
178
+ }
179
+
180
+ // Advise the kernel to preload the mapped memory
181
+ if (madvise(addr, file->size, MADV_WILLNEED)) {
182
+ fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
183
+ strerror(errno));
184
+ }
185
+ }
186
+
187
+ ~llama_mmap() {
188
+ munmap(addr, size);
189
+ }
190
+ #elif defined(_WIN32)
191
+ static constexpr bool SUPPORTED = true;
192
+
193
+ llama_mmap(struct llama_file * file) {
194
+ size = file->size;
195
+
196
+ HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
197
+
198
+ HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
199
+ DWORD error = GetLastError();
200
+ CloseHandle(hFile);
201
+
202
+ if (hMapping == NULL) {
203
+ throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
204
+ }
205
+
206
+ addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
207
+ error = GetLastError();
208
+ CloseHandle(hMapping);
209
+
210
+ if (addr == NULL) {
211
+ throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
212
+ }
213
+
214
+ #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
215
+ // Advise the kernel to preload the mapped memory
216
+ WIN32_MEMORY_RANGE_ENTRY range;
217
+ range.VirtualAddress = addr;
218
+ range.NumberOfBytes = (SIZE_T)size;
219
+ if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
220
+ fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
221
+ llama_format_win_err(GetLastError()).c_str());
222
+ }
223
+ #else
224
+ #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
225
+ #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
226
+ }
227
+
228
+ ~llama_mmap() {
229
+ if (!UnmapViewOfFile(addr)) {
230
+ fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
231
+ llama_format_win_err(GetLastError()).c_str());
232
+ }
233
+ }
234
+ #else
235
+ static constexpr bool SUPPORTED = false;
236
+
237
+ llama_mmap(struct llama_file *) {
238
+ throw std::string("mmap not supported");
239
+ }
240
+ #endif
241
+ };
242
+
243
+ // Represents some region of memory being locked using mlock or VirtualLock;
244
+ // will automatically unlock on destruction.
245
+ struct llama_mlock {
246
+ void * addr = NULL;
247
+ size_t size = 0;
248
+ bool failed_already = false;
249
+
250
+ llama_mlock() {}
251
+ llama_mlock(const llama_mlock &) = delete;
252
+
253
+ ~llama_mlock() {
254
+ if (size) {
255
+ raw_unlock(addr, size);
256
+ }
257
+ }
258
+
259
+ void init(void * addr) {
260
+ LLAMA_ASSERT(this->addr == NULL && this->size == 0);
261
+ this->addr = addr;
262
+ }
263
+
264
+ void grow_to(size_t target_size) {
265
+ LLAMA_ASSERT(addr);
266
+ if (failed_already) {
267
+ return;
268
+ }
269
+ size_t granularity = lock_granularity();
270
+ target_size = (target_size + granularity - 1) & ~(granularity - 1);
271
+ if (target_size > size) {
272
+ if (raw_lock((uint8_t *) addr + size, target_size - size)) {
273
+ size = target_size;
274
+ } else {
275
+ failed_already = true;
276
+ }
277
+ }
278
+ }
279
+
280
+ #ifdef _POSIX_MEMLOCK_RANGE
281
+ static constexpr bool SUPPORTED = true;
282
+
283
+ size_t lock_granularity() {
284
+ return (size_t) sysconf(_SC_PAGESIZE);
285
+ }
286
+
287
+ #ifdef __APPLE__
288
+ #define MLOCK_SUGGESTION \
289
+ "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
290
+ "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
291
+ #else
292
+ #define MLOCK_SUGGESTION \
293
+ "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
294
+ #endif
295
+
296
+ bool raw_lock(const void * addr, size_t size) {
297
+ if (!mlock(addr, size)) {
298
+ return true;
299
+ } else {
300
+ fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
301
+ size, this->size, std::strerror(errno));
302
+ return false;
303
+ }
304
+ }
305
+
306
+ #undef MLOCK_SUGGESTION
307
+
308
+ void raw_unlock(void * addr, size_t size) {
309
+ if (munlock(addr, size)) {
310
+ fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
311
+ }
312
+ }
313
+ #elif defined(_WIN32)
314
+ static constexpr bool SUPPORTED = true;
315
+
316
+ size_t lock_granularity() {
317
+ SYSTEM_INFO si;
318
+ GetSystemInfo(&si);
319
+ return (size_t) si.dwPageSize;
320
+ }
321
+
322
+ bool raw_lock(void * addr, size_t size) {
323
+ for (int tries = 1; ; tries++) {
324
+ if (VirtualLock(addr, size)) {
325
+ return true;
326
+ }
327
+ if (tries == 2) {
328
+ fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
329
+ size, this->size, llama_format_win_err(GetLastError()).c_str());
330
+ return false;
331
+ }
332
+
333
+ // It failed but this was only the first try; increase the working
334
+ // set size and try again.
335
+ SIZE_T min_ws_size, max_ws_size;
336
+ if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
337
+ fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
338
+ llama_format_win_err(GetLastError()).c_str());
339
+ return false;
340
+ }
341
+ // Per MSDN: "The maximum number of pages that a process can lock
342
+ // is equal to the number of pages in its minimum working set minus
343
+ // a small overhead."
344
+ // Hopefully a megabyte is enough overhead:
345
+ size_t increment = size + 1048576;
346
+ // The minimum must be <= the maximum, so we need to increase both:
347
+ min_ws_size += increment;
348
+ max_ws_size += increment;
349
+ if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
350
+ fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
351
+ llama_format_win_err(GetLastError()).c_str());
352
+ return false;
353
+ }
354
+ }
355
+ }
356
+
357
+ void raw_unlock(void * addr, size_t size) {
358
+ if (!VirtualUnlock(addr, size)) {
359
+ fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
360
+ llama_format_win_err(GetLastError()).c_str());
361
+ }
362
+ }
363
+ #else
364
+ static constexpr bool SUPPORTED = false;
365
+
366
+ void raw_lock(const void * addr, size_t size) {
367
+ fprintf(stderr, "warning: mlock not supported on this system\n");
368
+ }
369
+
370
+ void raw_unlock(const void * addr, size_t size) {}
371
+ #endif
372
+ };
373
+
374
// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
//
// Owns a single heap array. Ownership is unique: the buffer can be moved but
// not copied, because a copy would hold the same pointer and free it twice.
struct llama_buffer {
    uint8_t * addr = NULL;
    size_t size = 0;

    llama_buffer() = default;

    llama_buffer(const llama_buffer &) = delete;
    llama_buffer & operator=(const llama_buffer &) = delete;

    // Moving transfers the array and leaves the source empty.
    llama_buffer(llama_buffer && other) noexcept : addr(other.addr), size(other.size) {
        other.addr = NULL;
        other.size = 0;
    }

    llama_buffer & operator=(llama_buffer && other) noexcept {
        if (this != &other) {
            delete[] addr;
            addr = other.addr;
            size = other.size;
            other.addr = NULL;
            other.size = 0;
        }
        return *this;
    }

    // Replaces the contents with a fresh, uninitialized array of `size`
    // bytes. Previous contents are discarded, not preserved.
    void resize(size_t size) {
        // The old array is released before the new one is allocated so that
        // both never exist at the same time. Reset the members first: if the
        // allocation throws, the buffer is left empty instead of holding a
        // dangling pointer that the destructor would free again.
        delete[] addr;
        addr = NULL;
        this->size = 0;

        addr = new uint8_t[size];
        this->size = size;
    }

    ~llama_buffer() {
        delete[] addr;
    }
};
389
+ #endif
@@ -3,8 +3,8 @@
3
3
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
4
4
  module LLaMACpp
5
5
  # The version of llama_cpp.rb you install.
6
- VERSION = '0.0.3'
6
+ VERSION = '0.0.4'
7
7
 
8
8
  # The version of llama.cpp bundled with llama_cpp.rb.
9
- LLAMA_CPP_VERSION = 'master-698f7b5'
9
+ LLAMA_CPP_VERSION = 'master-c85e03d'
10
10
  end
data/sig/llama_cpp.rbs CHANGED
@@ -14,8 +14,11 @@ module LLaMACpp
14
14
  public
15
15
 
16
16
  def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
17
+ | () -> void
17
18
  def embeddings: () -> Array[Float]
18
19
  def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> Qnil
20
+ def free: () -> void
21
+ def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
19
22
  def logits: () -> Array[Float]
20
23
  def n_ctx: () -> Integer
21
24
  def n_embd: () -> Integer
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llama_cpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-04-08 00:00:00.000000000 Z
11
+ date: 2023-04-15 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
14
14
  email:
@@ -30,6 +30,7 @@ files:
30
30
  - ext/llama_cpp/src/ggml.h
31
31
  - ext/llama_cpp/src/llama.cpp
32
32
  - ext/llama_cpp/src/llama.h
33
+ - ext/llama_cpp/src/llama_util.h
33
34
  - lib/llama_cpp.rb
34
35
  - lib/llama_cpp/version.rb
35
36
  - sig/llama_cpp.rbs