llama_cpp 0.0.6 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -1
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +762 -36
- data/ext/llama_cpp/src/ggml-cuda.h +11 -4
- data/ext/llama_cpp/src/ggml-opencl.c +398 -0
- data/ext/llama_cpp/src/ggml-opencl.h +24 -0
- data/ext/llama_cpp/src/ggml.c +1957 -909
- data/ext/llama_cpp/src/ggml.h +696 -627
- data/ext/llama_cpp/src/{llama_util.h → llama-util.h} +91 -12
- data/ext/llama_cpp/src/llama.cpp +755 -159
- data/ext/llama_cpp/src/llama.h +85 -34
- data/lib/llama_cpp/client.rb +174 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +43 -11
- data/sig/llama_cpp.rbs +53 -3
- metadata +6 -3
@@ -14,6 +14,7 @@
|
|
14
14
|
|
15
15
|
#include <string>
|
16
16
|
#include <vector>
|
17
|
+
#include <stdexcept>
|
17
18
|
|
18
19
|
#ifdef __has_include
|
19
20
|
#if __has_include(<unistd.h>)
|
@@ -21,6 +22,9 @@
|
|
21
22
|
#if defined(_POSIX_MAPPED_FILES)
|
22
23
|
#include <sys/mman.h>
|
23
24
|
#endif
|
25
|
+
#if defined(_POSIX_MEMLOCK_RANGE)
|
26
|
+
#include <sys/resource.h>
|
27
|
+
#endif
|
24
28
|
#endif
|
25
29
|
#endif
|
26
30
|
|
@@ -71,7 +75,7 @@ struct llama_file {
|
|
71
75
|
llama_file(const char * fname, const char * mode) {
|
72
76
|
fp = std::fopen(fname, mode);
|
73
77
|
if (fp == NULL) {
|
74
|
-
throw format("failed to open %s: %s", fname,
|
78
|
+
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
75
79
|
}
|
76
80
|
seek(0, SEEK_END);
|
77
81
|
size = tell();
|
@@ -104,10 +108,10 @@ struct llama_file {
|
|
104
108
|
errno = 0;
|
105
109
|
std::size_t ret = std::fread(ptr, size, 1, fp);
|
106
110
|
if (ferror(fp)) {
|
107
|
-
throw format("read error: %s", strerror(errno));
|
111
|
+
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
108
112
|
}
|
109
113
|
if (ret != 1) {
|
110
|
-
throw std::string("unexpectedly reached end of file");
|
114
|
+
throw std::runtime_error(std::string("unexpectedly reached end of file"));
|
111
115
|
}
|
112
116
|
}
|
113
117
|
|
@@ -130,7 +134,7 @@ struct llama_file {
|
|
130
134
|
errno = 0;
|
131
135
|
size_t ret = std::fwrite(ptr, size, 1, fp);
|
132
136
|
if (ret != 1) {
|
133
|
-
throw format("write error: %s", strerror(errno));
|
137
|
+
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
134
138
|
}
|
135
139
|
}
|
136
140
|
|
@@ -177,7 +181,7 @@ struct llama_mmap {
|
|
177
181
|
#endif
|
178
182
|
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
179
183
|
if (addr == MAP_FAILED) {
|
180
|
-
throw format("mmap failed: %s", strerror(errno));
|
184
|
+
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
|
181
185
|
}
|
182
186
|
|
183
187
|
if (prefetch) {
|
@@ -204,7 +208,7 @@ struct llama_mmap {
|
|
204
208
|
DWORD error = GetLastError();
|
205
209
|
|
206
210
|
if (hMapping == NULL) {
|
207
|
-
throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
|
211
|
+
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
|
208
212
|
}
|
209
213
|
|
210
214
|
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
@@ -212,7 +216,7 @@ struct llama_mmap {
|
|
212
216
|
CloseHandle(hMapping);
|
213
217
|
|
214
218
|
if (addr == NULL) {
|
215
|
-
throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
|
219
|
+
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
|
216
220
|
}
|
217
221
|
|
218
222
|
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
@@ -240,8 +244,9 @@ struct llama_mmap {
|
|
240
244
|
#else
|
241
245
|
static constexpr bool SUPPORTED = false;
|
242
246
|
|
243
|
-
llama_mmap(struct llama_file
|
244
|
-
|
247
|
+
llama_mmap(struct llama_file *, bool prefetch = true) {
|
248
|
+
(void)prefetch;
|
249
|
+
throw std::runtime_error(std::string("mmap not supported"));
|
245
250
|
}
|
246
251
|
#endif
|
247
252
|
};
|
@@ -303,8 +308,18 @@ struct llama_mlock {
|
|
303
308
|
if (!mlock(addr, size)) {
|
304
309
|
return true;
|
305
310
|
} else {
|
306
|
-
|
307
|
-
|
311
|
+
char* errmsg = std::strerror(errno);
|
312
|
+
bool suggest = (errno == ENOMEM);
|
313
|
+
|
314
|
+
// Check if the resource limit is fine after all
|
315
|
+
struct rlimit lock_limit;
|
316
|
+
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
|
317
|
+
suggest = false;
|
318
|
+
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
|
319
|
+
suggest = false;
|
320
|
+
|
321
|
+
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
|
322
|
+
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
|
308
323
|
return false;
|
309
324
|
}
|
310
325
|
}
|
@@ -369,8 +384,13 @@ struct llama_mlock {
|
|
369
384
|
#else
|
370
385
|
static constexpr bool SUPPORTED = false;
|
371
386
|
|
372
|
-
|
387
|
+
size_t lock_granularity() {
|
388
|
+
return (size_t) 65536;
|
389
|
+
}
|
390
|
+
|
391
|
+
bool raw_lock(const void * addr, size_t size) {
|
373
392
|
fprintf(stderr, "warning: mlock not supported on this system\n");
|
393
|
+
return false;
|
374
394
|
}
|
375
395
|
|
376
396
|
void raw_unlock(const void * addr, size_t size) {}
|
@@ -382,6 +402,8 @@ struct llama_buffer {
|
|
382
402
|
uint8_t * addr = NULL;
|
383
403
|
size_t size = 0;
|
384
404
|
|
405
|
+
llama_buffer() = default;
|
406
|
+
|
385
407
|
void resize(size_t size) {
|
386
408
|
delete[] addr;
|
387
409
|
addr = new uint8_t[size];
|
@@ -391,5 +413,62 @@ struct llama_buffer {
|
|
391
413
|
~llama_buffer() {
|
392
414
|
delete[] addr;
|
393
415
|
}
|
416
|
+
|
417
|
+
// disable copy and move
|
418
|
+
llama_buffer(const llama_buffer&) = delete;
|
419
|
+
llama_buffer(llama_buffer&&) = delete;
|
420
|
+
llama_buffer& operator=(const llama_buffer&) = delete;
|
421
|
+
llama_buffer& operator=(llama_buffer&&) = delete;
|
422
|
+
};
|
423
|
+
|
424
|
+
#ifdef GGML_USE_CUBLAS
|
425
|
+
#include "ggml-cuda.h"
|
426
|
+
struct llama_ctx_buffer {
|
427
|
+
uint8_t * addr = NULL;
|
428
|
+
bool is_cuda;
|
429
|
+
size_t size = 0;
|
430
|
+
|
431
|
+
llama_ctx_buffer() = default;
|
432
|
+
|
433
|
+
void resize(size_t size) {
|
434
|
+
free();
|
435
|
+
|
436
|
+
addr = (uint8_t *) ggml_cuda_host_malloc(size);
|
437
|
+
if (addr) {
|
438
|
+
is_cuda = true;
|
439
|
+
}
|
440
|
+
else {
|
441
|
+
// fall back to pageable memory
|
442
|
+
addr = new uint8_t[size];
|
443
|
+
is_cuda = false;
|
444
|
+
}
|
445
|
+
this->size = size;
|
446
|
+
}
|
447
|
+
|
448
|
+
void free() {
|
449
|
+
if (addr) {
|
450
|
+
if (is_cuda) {
|
451
|
+
ggml_cuda_host_free(addr);
|
452
|
+
}
|
453
|
+
else {
|
454
|
+
delete[] addr;
|
455
|
+
}
|
456
|
+
}
|
457
|
+
addr = NULL;
|
458
|
+
}
|
459
|
+
|
460
|
+
~llama_ctx_buffer() {
|
461
|
+
free();
|
462
|
+
}
|
463
|
+
|
464
|
+
// disable copy and move
|
465
|
+
llama_ctx_buffer(const llama_ctx_buffer&) = delete;
|
466
|
+
llama_ctx_buffer(llama_ctx_buffer&&) = delete;
|
467
|
+
llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
|
468
|
+
llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
|
394
469
|
};
|
470
|
+
#else
|
471
|
+
typedef llama_buffer llama_ctx_buffer;
|
472
|
+
#endif
|
473
|
+
|
395
474
|
#endif
|