llama_cpp 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/extconf.rb +8 -2
- data/ext/llama_cpp/llama_cpp.cpp +60 -6
- data/ext/llama_cpp/src/ggml-cuda.h +2 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +1034 -0
- data/ext/llama_cpp/src/ggml-opencl.h +8 -10
- data/ext/llama_cpp/src/ggml.c +398 -184
- data/ext/llama_cpp/src/ggml.h +14 -3
- data/ext/llama_cpp/src/llama-util.h +23 -23
- data/ext/llama_cpp/src/llama.cpp +191 -92
- data/ext/llama_cpp/src/llama.h +30 -17
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -0
- data/sig/llama_cpp.rbs +1 -0
- metadata +3 -3
- data/ext/llama_cpp/src/ggml-opencl.c +0 -361
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -190,7 +190,7 @@
 #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1
 
-#define GGML_QNT_VERSION        1    // bump this on quantization format changes
+#define GGML_QNT_VERSION        2    // bump this on quantization format changes
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
 #define GGML_MAX_DIMS 4
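The version bump above is what invalidates previously quantized model files. As a hedged sketch of how the two macros interact (variable names are illustrative, following the convention used by ggml-based loaders): the quantization version is folded into the ftype field of the model header, so a single u32 carries both values.

    uint32_t stored = ftype + GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR; // pack on save
    uint32_t qntvr  = stored / GGML_QNT_VERSION_FACTOR; // unpack: quantization version
    uint32_t ftype2 = stored % GGML_QNT_VERSION_FACTOR; // unpack: actual file type

This is also why GGML_QNT_VERSION_FACTOR is marked "do not change this": altering it would break the decoding for every existing file.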
@@ -249,6 +249,7 @@ extern "C" {
     enum ggml_backend {
         GGML_BACKEND_CPU = 0,
         GGML_BACKEND_CUDA = 1,
+        GGML_BACKEND_CL = 2,
     };
 
     // model file types
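GGML_BACKEND_CL tags tensors whose data lives in OpenCL device memory, matching the new ggml-opencl.cpp implementation in the file list above. A hypothetical illustration (not from the diff) of where the tag would appear:

    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    t->backend = GGML_BACKEND_CL; // previously only GGML_BACKEND_CPU and GGML_BACKEND_CUDA existed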
@@ -313,6 +314,7 @@ extern "C" {
         GGML_OP_ROPE,
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
+        GGML_OP_CLAMP,
         GGML_OP_CONV_1D_1S,
         GGML_OP_CONV_1D_2S,
 
@@ -849,7 +851,7 @@ extern "C" {
            int n_past);
 
     // in-place, returns view(a)
-    GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace(
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int n_past);
@@ -897,7 +899,16 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int n_past,
-            int n_head);
+            int n_head,
+            float bias_max);
+
+    // clamp
+    // in-place, returns view(a)
+    struct ggml_tensor * ggml_clamp(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            float min,
+            float max);
 
     // padding = 1
     // TODO: we don't support extra parameters for now
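Taken together: ggml_alibi now takes an explicit maximum bias instead of hard-coding it, and the new ggml_clamp bounds every element of a tensor to [min, max]. A minimal sketch of calling the revised API; only the signatures come from the diff above, all values are illustrative:

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = { 16u * 1024 * 1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 32);
        // bias_max is the new fifth argument
        struct ggml_tensor * b = ggml_alibi(ctx, a, /*n_past=*/0, /*n_head=*/32, /*bias_max=*/8.0f);
        struct ggml_tensor * c = ggml_clamp(ctx, b, -5.0f, 5.0f); // in-place, returns view(a)

        (void) c;
        ggml_free(ctx);
        return 0;
    }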
data/ext/llama_cpp/src/llama-util.h
CHANGED
@@ -101,12 +101,12 @@ struct llama_file {
         LLAMA_ASSERT(ret == 0); // same
     }
 
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
+    void read_raw(void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
+        std::size_t ret = std::fread(ptr, len, 1, fp);
         if (ferror(fp)) {
             throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
@@ -127,12 +127,12 @@ struct llama_file {
         return std::string(chars.data(), len);
     }
 
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
+    void write_raw(const void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
+        size_t ret = std::fwrite(ptr, len, 1, fp);
         if (ret != 1) {
             throw std::runtime_error(format("write error: %s", strerror(errno)));
         }
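Both methods special-case len == 0 for the same reason: fread and fwrite are called with a single item of len bytes and return the number of complete items transferred.

    size_t ret = std::fread(buf, /*size=*/len, /*nmemb=*/1, fp);
    // ret == 1 -> one complete len-byte item was read
    // ret == 0 -> error or EOF, but also the result whenever len == 0

Without the early return, a zero-length read or write would falsely trip the ret != 1 error check.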
@@ -172,7 +172,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -184,9 +184,9 @@ struct llama_mmap {
             throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
 
-        if (prefetch) {
+        if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
                 fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
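Combined with the constructor change above, prefetch is now a byte budget rather than an on/off flag: the default (size_t) -1 hints the whole mapping, 0 skips the madvise hint entirely, and anything in between caps the hinted region via std::min(file->size, prefetch). A hedged usage sketch (the file name is illustrative, and each line shows an alternative call):

    llama_file file("model.bin", "rb");
    llama_mmap whole(&file);              // default: hint the entire file
    llama_mmap first_mb(&file, 1u << 20); // hint only the first MiB
    llama_mmap lazy(&file, 0);            // map without MADV_WILLNEED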
@@ -267,9 +267,9 @@ struct llama_mlock {
         }
     }
 
-    void init(void * addr) {
-        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
-        this->addr = addr;
+    void init(void * ptr) {
+        LLAMA_ASSERT(addr == NULL && size == 0);
+        addr = ptr;
     }
 
     void grow_to(size_t target_size) {
@@ -340,14 +340,14 @@ struct llama_mlock {
         return (size_t) si.dwPageSize;
     }
 
-    bool raw_lock(void * addr, size_t size) {
+    bool raw_lock(void * ptr, size_t len) {
         for (int tries = 1; ; tries++) {
-            if (VirtualLock(addr, size)) {
+            if (VirtualLock(ptr, len)) {
                 return true;
             }
             if (tries == 2) {
                 fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                        size, this->size, llama_format_win_err(GetLastError()).c_str());
+                        len, size, llama_format_win_err(GetLastError()).c_str());
                 return false;
             }
 
@@ -363,7 +363,7 @@ struct llama_mlock {
             // is equal to the number of pages in its minimum working set minus
             // a small overhead."
             // Hopefully a megabyte is enough overhead:
-            size_t increment = size + 1048576;
+            size_t increment = len + 1048576;
             // The minimum must be <= the maximum, so we need to increase both:
             min_ws_size += increment;
             max_ws_size += increment;
@@ -375,8 +375,8 @@ struct llama_mlock {
         }
     }
 
-    void raw_unlock(void * addr, size_t size) {
-        if (!VirtualUnlock(addr, size)) {
+    void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
             fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
@@ -388,12 +388,12 @@ struct llama_mlock {
         return (size_t) 65536;
     }
 
-    bool raw_lock(const void * addr, size_t size) {
+    bool raw_lock(const void * addr, size_t len) {
         fprintf(stderr, "warning: mlock not supported on this system\n");
         return false;
     }
 
-    void raw_unlock(const void * addr, size_t size) {}
+    void raw_unlock(const void * addr, size_t len) {}
 #endif
 };
 
@@ -404,10 +404,10 @@ struct llama_buffer {
 
     llama_buffer() = default;
 
-    void resize(size_t size) {
+    void resize(size_t len) {
         delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
+        addr = new uint8_t[len];
+        size = len;
     }
 
     ~llama_buffer() {