llama_cpp 0.10.3 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +13 -0
  3. data/LICENSE.txt +1 -1
  4. data/ext/llama_cpp/extconf.rb +35 -110
  5. data/ext/llama_cpp/llama_cpp.cpp +52 -28
  6. data/lib/llama_cpp/version.rb +2 -2
  7. data/sig/llama_cpp.rbs +3 -1
  8. data/vendor/include/.gitkeep +0 -0
  9. data/vendor/lib/.gitkeep +0 -0
  10. data/vendor/tmp/llama.cpp/Makefile +758 -0
  11. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend.c +6 -2
  12. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-cuda.cu +73 -63
  13. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-impl.h +1 -0
  14. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.m +43 -20
  15. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.metal +464 -245
  16. data/vendor/tmp/llama.cpp/ggml-opencl.h +25 -0
  17. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-quants.c +61 -57
  18. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml.c +171 -5
  19. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml.h +1 -0
  20. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/llama.cpp +222 -105
  21. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/llama.h +31 -32
  22. data/vendor/tmp/llama.cpp/scripts/get-flags.mk +38 -0
  23. metadata +30 -27
  24. data/ext/llama_cpp/src/ggml-opencl.h +0 -25
  25. data/ext/llama_cpp/src/llama-util.h +0 -546
  26. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/LICENSE +0 -0
  27. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-alloc.c +0 -0
  28. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-alloc.h +0 -0
  29. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend-impl.h +0 -0
  30. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend.h +0 -0
  31. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-cuda.h +0 -0
  32. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.h +0 -0
  33. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-mpi.c +0 -0
  34. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-mpi.h +0 -0
  35. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-opencl.cpp +0 -0
  36. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-quants.h +0 -0
  37. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/unicode.h +0 -0
data/ext/llama_cpp/src/llama-util.h (deleted)
@@ -1,546 +0,0 @@
- // Internal header to be included only by llama.cpp.
- // Contains wrappers around OS interfaces.
-
- #ifndef LLAMA_UTIL_H
- #define LLAMA_UTIL_H
-
- #include <cstdio>
- #include <cstdint>
- #include <cerrno>
- #include <cstring>
- #include <cstdarg>
- #include <cstdlib>
- #include <climits>
-
- #include <string>
- #include <vector>
- #include <stdexcept>
-
- #ifdef __has_include
-     #if __has_include(<unistd.h>)
-         #include <unistd.h>
-         #if defined(_POSIX_MAPPED_FILES)
-             #include <sys/mman.h>
-         #endif
-         #if defined(_POSIX_MEMLOCK_RANGE)
-             #include <sys/resource.h>
-         #endif
-     #endif
- #endif
-
- #if defined(_WIN32)
-     #define WIN32_LEAN_AND_MEAN
-     #ifndef NOMINMAX
-         #define NOMINMAX
-     #endif
-     #include <windows.h>
-     #include <io.h>
-     #include <stdio.h> // for _fseeki64
- #endif
-
- #define LLAMA_ASSERT(x) \
-     do { \
-         if (!(x)) { \
-             fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-             abort(); \
-         } \
-     } while (0)
-
- #ifdef __GNUC__
- #ifdef __MINGW32__
- __attribute__((format(gnu_printf, 1, 2)))
- #else
- __attribute__((format(printf, 1, 2)))
- #endif
- #endif
- static std::string format(const char * fmt, ...) {
-     va_list ap, ap2;
-     va_start(ap, fmt);
-     va_copy(ap2, ap);
-     int size = vsnprintf(NULL, 0, fmt, ap);
-     LLAMA_ASSERT(size >= 0 && size < INT_MAX);
-     std::vector<char> buf(size + 1);
-     int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-     LLAMA_ASSERT(size2 == size);
-     va_end(ap2);
-     va_end(ap);
-     return std::string(buf.data(), size);
- }
-
- struct llama_file {
-     // use FILE * so we don't have to re-open the file to mmap
-     FILE * fp;
-     size_t size;
-
-     llama_file(const char * fname, const char * mode) {
-         fp = std::fopen(fname, mode);
-         if (fp == NULL) {
-             throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
-         }
-         seek(0, SEEK_END);
-         size = tell();
-         seek(0, SEEK_SET);
-     }
-
-     size_t tell() const {
- #ifdef _WIN32
-         __int64 ret = _ftelli64(fp);
- #else
-         long ret = std::ftell(fp);
- #endif
-         LLAMA_ASSERT(ret != -1); // this really shouldn't fail
-         return (size_t) ret;
-     }
-
-     void seek(size_t offset, int whence) {
- #ifdef _WIN32
-         int ret = _fseeki64(fp, (__int64) offset, whence);
- #else
-         int ret = std::fseek(fp, (long) offset, whence);
- #endif
-         LLAMA_ASSERT(ret == 0); // same
-     }
-
-     void read_raw(void * ptr, size_t len) const {
-         if (len == 0) {
-             return;
-         }
-         errno = 0;
-         std::size_t ret = std::fread(ptr, len, 1, fp);
-         if (ferror(fp)) {
-             throw std::runtime_error(format("read error: %s", strerror(errno)));
-         }
-         if (ret != 1) {
-             throw std::runtime_error(std::string("unexpectedly reached end of file"));
-         }
-     }
-
-     std::uint32_t read_u32() {
-         std::uint32_t ret;
-         read_raw(&ret, sizeof(ret));
-         return ret;
-     }
-
-     std::string read_string(std::uint32_t len) {
-         std::vector<char> chars(len);
-         read_raw(chars.data(), len);
-         return std::string(chars.data(), len);
-     }
-
-     void write_raw(const void * ptr, size_t len) const {
-         if (len == 0) {
-             return;
-         }
-         errno = 0;
-         size_t ret = std::fwrite(ptr, len, 1, fp);
-         if (ret != 1) {
-             throw std::runtime_error(format("write error: %s", strerror(errno)));
-         }
-     }
-
-     void write_u32(std::uint32_t val) {
-         write_raw(&val, sizeof(val));
-     }
-
-     ~llama_file() {
-         if (fp) {
-             std::fclose(fp);
-         }
-     }
- };
-
- // llama_context_data
- struct llama_data_context {
-     virtual void write(const void * src, size_t size) = 0;
-     virtual size_t get_size_written() = 0;
-     virtual ~llama_data_context() = default;
- };
-
- struct llama_data_buffer_context : llama_data_context {
-     uint8_t* ptr;
-     size_t size_written = 0;
-
-     llama_data_buffer_context(uint8_t * p) : ptr(p) {}
-
-     void write(const void * src, size_t size) override {
-         memcpy(ptr, src, size);
-         ptr += size;
-         size_written += size;
-     }
-
-     size_t get_size_written() override {
-         return size_written;
-     }
- };
-
- struct llama_data_file_context : llama_data_context {
-     llama_file* file;
-     size_t size_written = 0;
-
-     llama_data_file_context(llama_file * f) : file(f) {}
-
-     void write(const void * src, size_t size) override {
-         file->write_raw(src, size);
-         size_written += size;
-     }
-
-     size_t get_size_written() override {
-         return size_written;
-     }
- };
-
- #if defined(_WIN32)
- static std::string llama_format_win_err(DWORD err) {
-     LPSTR buf;
-     size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
-                                  NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
-     if (!size) {
-         return "FormatMessageA failed";
-     }
-     std::string ret(buf, size);
-     LocalFree(buf);
-     return ret;
- }
- #endif
-
- struct llama_mmap {
-     void * addr;
-     size_t size;
-
-     llama_mmap(const llama_mmap &) = delete;
-
- #ifdef _POSIX_MAPPED_FILES
-     static constexpr bool SUPPORTED = true;
-
-     llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
-         size = file->size;
-         int fd = fileno(file->fp);
-         int flags = MAP_SHARED;
-         // prefetch/readahead impairs performance on NUMA systems
-         if (numa) { prefetch = 0; }
- #ifdef __linux__
-         if (prefetch >= file->size) { flags |= MAP_POPULATE; }
- #endif
-         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-         if (addr == MAP_FAILED) {
-             throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
-         }
-
-         if (prefetch > 0) {
-             // Advise the kernel to preload the mapped memory
-             if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
-                 fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
-                         strerror(errno));
-             }
-         }
-         if (numa) {
-             // advise the kernel not to use readahead
-             // (because the next page might not belong on the same node)
-             if (madvise(addr, file->size, MADV_RANDOM)) {
-                 fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
-                         strerror(errno));
-             }
-         }
-     }
-
-     ~llama_mmap() {
-         munmap(addr, size);
-     }
- #elif defined(_WIN32)
-     static constexpr bool SUPPORTED = true;
-
-     llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
-         (void) numa;
-
-         size = file->size;
-
-         HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
-
-         HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
-         DWORD error = GetLastError();
-
-         if (hMapping == NULL) {
-             throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
-         }
-
-         addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
-         error = GetLastError();
-         CloseHandle(hMapping);
-
-         if (addr == NULL) {
-             throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
-         }
-
- #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-         if (prefetch) {
-             // Advise the kernel to preload the mapped memory
-
-             WIN32_MEMORY_RANGE_ENTRY range;
-             range.VirtualAddress = addr;
-
-             range.NumberOfBytes = (SIZE_T)size;
-             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-                 fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                         llama_format_win_err(GetLastError()).c_str());
-             }
-         }
- #else
- #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
- #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
-     }
-
-     ~llama_mmap() {
-         if (!UnmapViewOfFile(addr)) {
-             fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
-                     llama_format_win_err(GetLastError()).c_str());
-         }
-     }
- #else
-     static constexpr bool SUPPORTED = false;
-
-     llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
-         (void) prefetch;
-         (void) numa;
-
-         throw std::runtime_error(std::string("mmap not supported"));
-     }
- #endif
- };
-
- // Represents some region of memory being locked using mlock or VirtualLock;
- // will automatically unlock on destruction.
- struct llama_mlock {
-     void * addr = NULL;
-     size_t size = 0;
-     bool failed_already = false;
-
-     llama_mlock() {}
-     llama_mlock(const llama_mlock &) = delete;
-
-     ~llama_mlock() {
-         if (size) {
-             raw_unlock(addr, size);
-         }
-     }
-
-     void init(void * ptr) {
-         LLAMA_ASSERT(addr == NULL && size == 0);
-         addr = ptr;
-     }
-
-     void grow_to(size_t target_size) {
-         LLAMA_ASSERT(addr);
-         if (failed_already) {
-             return;
-         }
-         size_t granularity = lock_granularity();
-         target_size = (target_size + granularity - 1) & ~(granularity - 1);
-         if (target_size > size) {
-             if (raw_lock((uint8_t *) addr + size, target_size - size)) {
-                 size = target_size;
-             } else {
-                 failed_already = true;
-             }
-         }
-     }
-
- #ifdef _POSIX_MEMLOCK_RANGE
-     static constexpr bool SUPPORTED = true;
-
-     size_t lock_granularity() {
-         return (size_t) sysconf(_SC_PAGESIZE);
-     }
-
-     #ifdef __APPLE__
-         #define MLOCK_SUGGESTION \
-             "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
-             "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
-     #else
-         #define MLOCK_SUGGESTION \
-             "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
-     #endif
-
-     bool raw_lock(const void * addr, size_t size) {
-         if (!mlock(addr, size)) {
-             return true;
-         } else {
-             char* errmsg = std::strerror(errno);
-             bool suggest = (errno == ENOMEM);
-
-             // Check if the resource limit is fine after all
-             struct rlimit lock_limit;
-             if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
-                 suggest = false;
-             if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
-                 suggest = false;
-
-             fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
-                     size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
-             return false;
-         }
-     }
-
-     #undef MLOCK_SUGGESTION
-
-     void raw_unlock(void * addr, size_t size) {
-         if (munlock(addr, size)) {
-             fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
-         }
-     }
- #elif defined(_WIN32)
-     static constexpr bool SUPPORTED = true;
-
-     size_t lock_granularity() {
-         SYSTEM_INFO si;
-         GetSystemInfo(&si);
-         return (size_t) si.dwPageSize;
-     }
-
-     bool raw_lock(void * ptr, size_t len) {
-         for (int tries = 1; ; tries++) {
-             if (VirtualLock(ptr, len)) {
-                 return true;
-             }
-             if (tries == 2) {
-                 fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                         len, size, llama_format_win_err(GetLastError()).c_str());
-                 return false;
-             }
-
-             // It failed but this was only the first try; increase the working
-             // set size and try again.
-             SIZE_T min_ws_size, max_ws_size;
-             if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
-                 fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
-                         llama_format_win_err(GetLastError()).c_str());
-                 return false;
-             }
-             // Per MSDN: "The maximum number of pages that a process can lock
-             // is equal to the number of pages in its minimum working set minus
-             // a small overhead."
-             // Hopefully a megabyte is enough overhead:
-             size_t increment = len + 1048576;
-             // The minimum must be <= the maximum, so we need to increase both:
-             min_ws_size += increment;
-             max_ws_size += increment;
-             if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
-                 fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
-                         llama_format_win_err(GetLastError()).c_str());
-                 return false;
-             }
-         }
-     }
-
-     void raw_unlock(void * ptr, size_t len) {
-         if (!VirtualUnlock(ptr, len)) {
-             fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
-                     llama_format_win_err(GetLastError()).c_str());
-         }
-     }
- #else
-     static constexpr bool SUPPORTED = false;
-
-     size_t lock_granularity() {
-         return (size_t) 65536;
-     }
-
-     bool raw_lock(const void * addr, size_t len) {
-         fprintf(stderr, "warning: mlock not supported on this system\n");
-         return false;
-     }
-
-     void raw_unlock(const void * addr, size_t len) {}
- #endif
- };
-
- // Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
- struct llama_buffer {
-     uint8_t * addr = NULL;
-     size_t size = 0;
-
-     llama_buffer() = default;
-
-     void resize(size_t len) {
- #ifdef GGML_USE_METAL
-         free(addr);
-         int result = posix_memalign((void **) &addr, getpagesize(), len);
-         if (result == 0) {
-             memset(addr, 0, len);
-         }
-         else {
-             addr = NULL;
-         }
- #else
-         delete[] addr;
-         addr = new uint8_t[len];
- #endif
-         size = len;
-     }
-
-     ~llama_buffer() {
- #ifdef GGML_USE_METAL
-         free(addr);
- #else
-         delete[] addr;
- #endif
-         addr = NULL;
-     }
-
-     // disable copy and move
-     llama_buffer(const llama_buffer&) = delete;
-     llama_buffer(llama_buffer&&) = delete;
-     llama_buffer& operator=(const llama_buffer&) = delete;
-     llama_buffer& operator=(llama_buffer&&) = delete;
- };
-
- #ifdef GGML_USE_CUBLAS
- #include "ggml-cuda.h"
- struct llama_ctx_buffer {
-     uint8_t * addr = NULL;
-     bool is_cuda;
-     size_t size = 0;
-
-     llama_ctx_buffer() = default;
-
-     void resize(size_t size) {
-         free();
-
-         addr = (uint8_t *) ggml_cuda_host_malloc(size);
-         if (addr) {
-             is_cuda = true;
-         }
-         else {
-             // fall back to pageable memory
-             addr = new uint8_t[size];
-             is_cuda = false;
-         }
-         this->size = size;
-     }
-
-     void free() {
-         if (addr) {
-             if (is_cuda) {
-                 ggml_cuda_host_free(addr);
-             }
-             else {
-                 delete[] addr;
-             }
-         }
-         addr = NULL;
-     }
-
-     ~llama_ctx_buffer() {
-         free();
-     }
-
-     // disable copy and move
-     llama_ctx_buffer(const llama_ctx_buffer&) = delete;
-     llama_ctx_buffer(llama_ctx_buffer&&) = delete;
-     llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
-     llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
- };
- #else
- typedef llama_buffer llama_ctx_buffer;
- #endif
-
- #endif
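
Note on the removed header: its format() helper (lines 49–68 of the deleted file) is the textbook two-pass vsnprintf idiom, and every error message in the header is built on it. The first vsnprintf call measures the formatted length without writing; va_copy keeps a second va_list cursor so a second call can then write into an exactly-sized buffer. A minimal standalone sketch of that idiom follows; the main() driver, the assert-based checks, and the "model.bin" path are illustrative additions, not code from the gem:

// Sketch of the two-pass vsnprintf idiom from the removed format() helper.
// Pass 1 measures the formatted length; pass 2 writes into an exactly-sized
// buffer. va_copy is required because pass 1 consumes the first va_list.
#include <cassert>
#include <climits>
#include <cstdarg>
#include <cstdio>
#include <string>
#include <vector>

static std::string format(const char * fmt, ...) {
    va_list ap, ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    int size = std::vsnprintf(NULL, 0, fmt, ap);              // pass 1: length only
    assert(size >= 0 && size < INT_MAX);                      // illustrative stand-in for LLAMA_ASSERT
    std::vector<char> buf(size + 1);                          // +1 for the trailing NUL
    int size2 = std::vsnprintf(buf.data(), size + 1, fmt, ap2); // pass 2: actual write
    assert(size2 == size);
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);                     // std::string tracks length; NUL dropped
}

int main() {
    // the error-message style used throughout the removed header ("model.bin" is a made-up path)
    std::printf("%s\n", format("failed to open %s: %s", "model.bin", "No such file or directory").c_str());
    return 0;
}

The same acquire-in-constructor, release-in-destructor pattern is what llama_file, llama_mmap, and llama_mlock wrapped around fopen, mmap/MapViewOfFile, and mlock/VirtualLock. As the file list above shows, from 0.11.0 the gem vendors the upstream llama.cpp sources under data/vendor/tmp/llama.cpp and relies on their equivalents instead of shipping this header in its own extension tree.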