llama_cpp 0.0.6 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -1
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +762 -36
- data/ext/llama_cpp/src/ggml-cuda.h +11 -4
- data/ext/llama_cpp/src/ggml-opencl.c +398 -0
- data/ext/llama_cpp/src/ggml-opencl.h +24 -0
- data/ext/llama_cpp/src/ggml.c +1957 -909
- data/ext/llama_cpp/src/ggml.h +696 -627
- data/ext/llama_cpp/src/{llama_util.h → llama-util.h} +91 -12
- data/ext/llama_cpp/src/llama.cpp +755 -159
- data/ext/llama_cpp/src/llama.h +85 -34
- data/lib/llama_cpp/client.rb +174 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +43 -11
- data/sig/llama_cpp.rbs +53 -3
- metadata +6 -3
@@ -14,6 +14,7 @@
|
|
14
14
|
|
15
15
|
#include <string>
|
16
16
|
#include <vector>
|
17
|
+
#include <stdexcept>
|
17
18
|
|
18
19
|
#ifdef __has_include
|
19
20
|
#if __has_include(<unistd.h>)
|
@@ -21,6 +22,9 @@
|
|
21
22
|
#if defined(_POSIX_MAPPED_FILES)
|
22
23
|
#include <sys/mman.h>
|
23
24
|
#endif
|
25
|
+
#if defined(_POSIX_MEMLOCK_RANGE)
|
26
|
+
#include <sys/resource.h>
|
27
|
+
#endif
|
24
28
|
#endif
|
25
29
|
#endif
|
26
30
|
|
@@ -71,7 +75,7 @@ struct llama_file {
|
|
71
75
|
llama_file(const char * fname, const char * mode) {
|
72
76
|
fp = std::fopen(fname, mode);
|
73
77
|
if (fp == NULL) {
|
74
|
-
throw format("failed to open %s: %s", fname, strerror(errno));
|
78
|
+
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
75
79
|
}
|
76
80
|
seek(0, SEEK_END);
|
77
81
|
size = tell();
|
@@ -104,10 +108,10 @@ struct llama_file {
|
|
104
108
|
errno = 0;
|
105
109
|
std::size_t ret = std::fread(ptr, size, 1, fp);
|
106
110
|
if (ferror(fp)) {
|
107
|
-
throw format("read error: %s", strerror(errno));
|
111
|
+
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
108
112
|
}
|
109
113
|
if (ret != 1) {
|
110
|
-
throw std::string("unexpectedly reached end of file");
|
114
|
+
throw std::runtime_error(std::string("unexpectedly reached end of file"));
|
111
115
|
}
|
112
116
|
}
|
113
117
|
|
@@ -130,7 +134,7 @@ struct llama_file {
|
|
130
134
|
errno = 0;
|
131
135
|
size_t ret = std::fwrite(ptr, size, 1, fp);
|
132
136
|
if (ret != 1) {
|
133
|
-
throw format("write error: %s", strerror(errno));
|
137
|
+
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
134
138
|
}
|
135
139
|
}
|
136
140
|
|
@@ -177,7 +181,7 @@ struct llama_mmap {
|
|
177
181
|
#endif
|
178
182
|
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
179
183
|
if (addr == MAP_FAILED) {
|
180
|
-
throw format("mmap failed: %s", strerror(errno));
|
184
|
+
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
|
181
185
|
}
|
182
186
|
|
183
187
|
if (prefetch) {
|
@@ -204,7 +208,7 @@ struct llama_mmap {
|
|
204
208
|
DWORD error = GetLastError();
|
205
209
|
|
206
210
|
if (hMapping == NULL) {
|
207
|
-
throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
|
211
|
+
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
|
208
212
|
}
|
209
213
|
|
210
214
|
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
@@ -212,7 +216,7 @@ struct llama_mmap {
|
|
212
216
|
CloseHandle(hMapping);
|
213
217
|
|
214
218
|
if (addr == NULL) {
|
215
|
-
throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
|
219
|
+
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
|
216
220
|
}
|
217
221
|
|
218
222
|
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
@@ -240,8 +244,9 @@ struct llama_mmap {
|
|
240
244
|
#else
|
241
245
|
static constexpr bool SUPPORTED = false;
|
242
246
|
|
243
|
-
llama_mmap(struct llama_file *) {
|
244
|
-
|
247
|
+
llama_mmap(struct llama_file *, bool prefetch = true) {
|
248
|
+
(void)prefetch;
|
249
|
+
throw std::runtime_error(std::string("mmap not supported"));
|
245
250
|
}
|
246
251
|
#endif
|
247
252
|
};
|
@@ -303,8 +308,18 @@ struct llama_mlock {
|
|
303
308
|
if (!mlock(addr, size)) {
|
304
309
|
return true;
|
305
310
|
} else {
|
306
|
-
|
307
|
-
|
311
|
+
char* errmsg = std::strerror(errno);
|
312
|
+
bool suggest = (errno == ENOMEM);
|
313
|
+
|
314
|
+
// Check if the resource limit is fine after all
|
315
|
+
struct rlimit lock_limit;
|
316
|
+
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
|
317
|
+
suggest = false;
|
318
|
+
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
|
319
|
+
suggest = false;
|
320
|
+
|
321
|
+
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
|
322
|
+
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
|
308
323
|
return false;
|
309
324
|
}
|
310
325
|
}
|
@@ -369,8 +384,13 @@ struct llama_mlock {
|
|
369
384
|
#else
|
370
385
|
static constexpr bool SUPPORTED = false;
|
371
386
|
|
372
|
-
|
387
|
+
size_t lock_granularity() {
|
388
|
+
return (size_t) 65536;
|
389
|
+
}
|
390
|
+
|
391
|
+
bool raw_lock(const void * addr, size_t size) {
|
373
392
|
fprintf(stderr, "warning: mlock not supported on this system\n");
|
393
|
+
return false;
|
374
394
|
}
|
375
395
|
|
376
396
|
void raw_unlock(const void * addr, size_t size) {}
|
@@ -382,6 +402,8 @@ struct llama_buffer {
|
|
382
402
|
uint8_t * addr = NULL;
|
383
403
|
size_t size = 0;
|
384
404
|
|
405
|
+
llama_buffer() = default;
|
406
|
+
|
385
407
|
void resize(size_t size) {
|
386
408
|
delete[] addr;
|
387
409
|
addr = new uint8_t[size];
|
@@ -391,5 +413,62 @@ struct llama_buffer {
|
|
391
413
|
~llama_buffer() {
|
392
414
|
delete[] addr;
|
393
415
|
}
|
416
|
+
|
417
|
+
// disable copy and move
|
418
|
+
llama_buffer(const llama_buffer&) = delete;
|
419
|
+
llama_buffer(llama_buffer&&) = delete;
|
420
|
+
llama_buffer& operator=(const llama_buffer&) = delete;
|
421
|
+
llama_buffer& operator=(llama_buffer&&) = delete;
|
422
|
+
};
|
423
|
+
|
424
|
+
#ifdef GGML_USE_CUBLAS
|
425
|
+
#include "ggml-cuda.h"
|
426
|
+
struct llama_ctx_buffer {
|
427
|
+
uint8_t * addr = NULL;
|
428
|
+
bool is_cuda;
|
429
|
+
size_t size = 0;
|
430
|
+
|
431
|
+
llama_ctx_buffer() = default;
|
432
|
+
|
433
|
+
void resize(size_t size) {
|
434
|
+
free();
|
435
|
+
|
436
|
+
addr = (uint8_t *) ggml_cuda_host_malloc(size);
|
437
|
+
if (addr) {
|
438
|
+
is_cuda = true;
|
439
|
+
}
|
440
|
+
else {
|
441
|
+
// fall back to pageable memory
|
442
|
+
addr = new uint8_t[size];
|
443
|
+
is_cuda = false;
|
444
|
+
}
|
445
|
+
this->size = size;
|
446
|
+
}
|
447
|
+
|
448
|
+
void free() {
|
449
|
+
if (addr) {
|
450
|
+
if (is_cuda) {
|
451
|
+
ggml_cuda_host_free(addr);
|
452
|
+
}
|
453
|
+
else {
|
454
|
+
delete[] addr;
|
455
|
+
}
|
456
|
+
}
|
457
|
+
addr = NULL;
|
458
|
+
}
|
459
|
+
|
460
|
+
~llama_ctx_buffer() {
|
461
|
+
free();
|
462
|
+
}
|
463
|
+
|
464
|
+
// disable copy and move
|
465
|
+
llama_ctx_buffer(const llama_ctx_buffer&) = delete;
|
466
|
+
llama_ctx_buffer(llama_ctx_buffer&&) = delete;
|
467
|
+
llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
|
468
|
+
llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
|
394
469
|
};
|
470
|
+
#else
|
471
|
+
typedef llama_buffer llama_ctx_buffer;
|
472
|
+
#endif
|
473
|
+
|
395
474
|
#endif
|