llama_cpp 0.0.7 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +736 -36
- data/ext/llama_cpp/src/ggml-cuda.h +8 -33
- data/ext/llama_cpp/src/ggml-opencl.c +202 -20
- data/ext/llama_cpp/src/ggml.c +732 -496
- data/ext/llama_cpp/src/ggml.h +47 -5
- data/ext/llama_cpp/src/{llama_util.h → llama-util.h} +76 -10
- data/ext/llama_cpp/src/llama.cpp +560 -147
- data/ext/llama_cpp/src/llama.h +71 -24
- data/lib/llama_cpp/client.rb +29 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +27 -3
- data/sig/llama_cpp.rbs +38 -3
- metadata +3 -3
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -197,6 +197,14 @@
|
|
197
197
|
#define GGML_MAX_OPT 4
|
198
198
|
#define GGML_DEFAULT_N_THREADS 4
|
199
199
|
|
200
|
+
#define GGML_ASSERT(x) \
|
201
|
+
do { \
|
202
|
+
if (!(x)) { \
|
203
|
+
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
204
|
+
abort(); \
|
205
|
+
} \
|
206
|
+
} while (0)
|
207
|
+
|
200
208
|
#ifdef __cplusplus
|
201
209
|
extern "C" {
|
202
210
|
#endif
|
@@ -212,6 +220,9 @@ extern "C" {
|
|
212
220
|
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
213
221
|
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
|
214
222
|
|
223
|
+
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
|
224
|
+
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
|
225
|
+
|
215
226
|
struct ggml_object;
|
216
227
|
struct ggml_context;
|
217
228
|
|
@@ -221,7 +232,7 @@ extern "C" {
|
|
221
232
|
GGML_TYPE_Q4_0 = 2,
|
222
233
|
GGML_TYPE_Q4_1 = 3,
|
223
234
|
GGML_TYPE_Q4_2 = 4,
|
224
|
-
GGML_TYPE_Q4_3 = 5,
|
235
|
+
// GGML_TYPE_Q4_3 (5) support has been removed
|
225
236
|
GGML_TYPE_Q5_0 = 6,
|
226
237
|
GGML_TYPE_Q5_1 = 7,
|
227
238
|
GGML_TYPE_Q8_0 = 8,
|
@@ -232,6 +243,20 @@ extern "C" {
|
|
232
243
|
GGML_TYPE_COUNT,
|
233
244
|
};
|
234
245
|
|
246
|
+
// model file types
|
247
|
+
enum ggml_ftype {
|
248
|
+
GGML_FTYPE_UNKNOWN = -1,
|
249
|
+
GGML_FTYPE_ALL_F32 = 0,
|
250
|
+
GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
251
|
+
GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
252
|
+
GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
253
|
+
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
254
|
+
GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
|
255
|
+
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
256
|
+
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
257
|
+
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
258
|
+
};
|
259
|
+
|
235
260
|
// available tensor operations:
|
236
261
|
enum ggml_op {
|
237
262
|
GGML_OP_NONE = 0,
|
@@ -269,6 +294,7 @@ extern "C" {
|
|
269
294
|
GGML_OP_DIAG_MASK_INF,
|
270
295
|
GGML_OP_SOFT_MAX,
|
271
296
|
GGML_OP_ROPE,
|
297
|
+
GGML_OP_ALIBI,
|
272
298
|
GGML_OP_CONV_1D_1S,
|
273
299
|
GGML_OP_CONV_1D_2S,
|
274
300
|
|
@@ -324,7 +350,10 @@ extern "C" {
|
|
324
350
|
int64_t perf_time_us;
|
325
351
|
|
326
352
|
void * data;
|
327
|
-
|
353
|
+
|
354
|
+
char name[32];
|
355
|
+
|
356
|
+
char padding[8]; // TODO: remove and add padding to name?
|
328
357
|
};
|
329
358
|
|
330
359
|
// computation graph
|
@@ -384,6 +413,9 @@ extern "C" {
|
|
384
413
|
|
385
414
|
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
386
415
|
|
416
|
+
// TODO: temporary until model loading of ggml examples is refactored
|
417
|
+
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
418
|
+
|
387
419
|
// main
|
388
420
|
|
389
421
|
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
@@ -444,6 +476,9 @@ extern "C" {
|
|
444
476
|
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
445
477
|
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
446
478
|
|
479
|
+
GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
|
480
|
+
GGML_API void ggml_set_name(struct ggml_tensor * tensor, const char * name);
|
481
|
+
|
447
482
|
//
|
448
483
|
// operations on tensors with backpropagation
|
449
484
|
//
|
@@ -662,6 +697,14 @@ extern "C" {
|
|
662
697
|
int n_dims,
|
663
698
|
int mode);
|
664
699
|
|
700
|
+
// alibi position embedding
|
701
|
+
// in-place, returns view(a)
|
702
|
+
struct ggml_tensor * ggml_alibi(
|
703
|
+
struct ggml_context * ctx,
|
704
|
+
struct ggml_tensor * a,
|
705
|
+
int n_past,
|
706
|
+
int n_head);
|
707
|
+
|
665
708
|
// padding = 1
|
666
709
|
// TODO: we don't support extra parameters for now
|
667
710
|
// that's why we are hard-coding the stride, padding, and dilation
|
@@ -692,8 +735,8 @@ extern "C" {
|
|
692
735
|
struct ggml_tensor * c1);
|
693
736
|
|
694
737
|
// Mapping operations
|
695
|
-
|
696
|
-
|
738
|
+
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
|
739
|
+
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
697
740
|
|
698
741
|
GGML_API struct ggml_tensor * ggml_map_unary_f32(
|
699
742
|
struct ggml_context * ctx,
|
@@ -834,7 +877,6 @@ extern "C" {
|
|
834
877
|
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
835
878
|
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
836
879
|
GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
|
837
|
-
GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
|
838
880
|
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
839
881
|
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
840
882
|
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
@@ -14,6 +14,7 @@
|
|
14
14
|
|
15
15
|
#include <string>
|
16
16
|
#include <vector>
|
17
|
+
#include <stdexcept>
|
17
18
|
|
18
19
|
#ifdef __has_include
|
19
20
|
#if __has_include(<unistd.h>)
|
@@ -74,7 +75,7 @@ struct llama_file {
|
|
74
75
|
llama_file(const char * fname, const char * mode) {
|
75
76
|
fp = std::fopen(fname, mode);
|
76
77
|
if (fp == NULL) {
|
77
|
-
throw format("failed to open %s: %s", fname, std::strerror(errno));
|
78
|
+
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
78
79
|
}
|
79
80
|
seek(0, SEEK_END);
|
80
81
|
size = tell();
|
@@ -107,10 +108,10 @@ struct llama_file {
|
|
107
108
|
errno = 0;
|
108
109
|
std::size_t ret = std::fread(ptr, size, 1, fp);
|
109
110
|
if (ferror(fp)) {
|
110
|
-
throw format("read error: %s", strerror(errno));
|
111
|
+
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
111
112
|
}
|
112
113
|
if (ret != 1) {
|
113
|
-
throw std::string("unexpectedly reached end of file");
|
114
|
+
throw std::runtime_error(std::string("unexpectedly reached end of file"));
|
114
115
|
}
|
115
116
|
}
|
116
117
|
|
@@ -133,7 +134,7 @@ struct llama_file {
|
|
133
134
|
errno = 0;
|
134
135
|
size_t ret = std::fwrite(ptr, size, 1, fp);
|
135
136
|
if (ret != 1) {
|
136
|
-
throw format("write error: %s", strerror(errno));
|
137
|
+
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
137
138
|
}
|
138
139
|
}
|
139
140
|
|
@@ -180,7 +181,7 @@ struct llama_mmap {
|
|
180
181
|
#endif
|
181
182
|
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
182
183
|
if (addr == MAP_FAILED) {
|
183
|
-
throw format("mmap failed: %s", strerror(errno));
|
184
|
+
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
|
184
185
|
}
|
185
186
|
|
186
187
|
if (prefetch) {
|
@@ -207,7 +208,7 @@ struct llama_mmap {
|
|
207
208
|
DWORD error = GetLastError();
|
208
209
|
|
209
210
|
if (hMapping == NULL) {
|
210
|
-
throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
|
211
|
+
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
|
211
212
|
}
|
212
213
|
|
213
214
|
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
@@ -215,7 +216,7 @@ struct llama_mmap {
|
|
215
216
|
CloseHandle(hMapping);
|
216
217
|
|
217
218
|
if (addr == NULL) {
|
218
|
-
throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
|
219
|
+
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
|
219
220
|
}
|
220
221
|
|
221
222
|
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
@@ -243,8 +244,9 @@ struct llama_mmap {
|
|
243
244
|
#else
|
244
245
|
static constexpr bool SUPPORTED = false;
|
245
246
|
|
246
|
-
llama_mmap(struct llama_file *) {
|
247
|
-
|
247
|
+
llama_mmap(struct llama_file *, bool prefetch = true) {
|
248
|
+
(void)prefetch;
|
249
|
+
throw std::runtime_error(std::string("mmap not supported"));
|
248
250
|
}
|
249
251
|
#endif
|
250
252
|
};
|
@@ -382,8 +384,13 @@ struct llama_mlock {
|
|
382
384
|
#else
|
383
385
|
static constexpr bool SUPPORTED = false;
|
384
386
|
|
385
|
-
|
387
|
+
size_t lock_granularity() {
|
388
|
+
return (size_t) 65536;
|
389
|
+
}
|
390
|
+
|
391
|
+
bool raw_lock(const void * addr, size_t size) {
|
386
392
|
fprintf(stderr, "warning: mlock not supported on this system\n");
|
393
|
+
return false;
|
387
394
|
}
|
388
395
|
|
389
396
|
void raw_unlock(const void * addr, size_t size) {}
|
@@ -395,6 +402,8 @@ struct llama_buffer {
|
|
395
402
|
uint8_t * addr = NULL;
|
396
403
|
size_t size = 0;
|
397
404
|
|
405
|
+
llama_buffer() = default;
|
406
|
+
|
398
407
|
void resize(size_t size) {
|
399
408
|
delete[] addr;
|
400
409
|
addr = new uint8_t[size];
|
@@ -404,5 +413,62 @@ struct llama_buffer {
|
|
404
413
|
~llama_buffer() {
|
405
414
|
delete[] addr;
|
406
415
|
}
|
416
|
+
|
417
|
+
// disable copy and move
|
418
|
+
llama_buffer(const llama_buffer&) = delete;
|
419
|
+
llama_buffer(llama_buffer&&) = delete;
|
420
|
+
llama_buffer& operator=(const llama_buffer&) = delete;
|
421
|
+
llama_buffer& operator=(llama_buffer&&) = delete;
|
407
422
|
};
|
423
|
+
|
424
|
+
#ifdef GGML_USE_CUBLAS
|
425
|
+
#include "ggml-cuda.h"
|
426
|
+
struct llama_ctx_buffer {
|
427
|
+
uint8_t * addr = NULL;
|
428
|
+
bool is_cuda;
|
429
|
+
size_t size = 0;
|
430
|
+
|
431
|
+
llama_ctx_buffer() = default;
|
432
|
+
|
433
|
+
void resize(size_t size) {
|
434
|
+
free();
|
435
|
+
|
436
|
+
addr = (uint8_t *) ggml_cuda_host_malloc(size);
|
437
|
+
if (addr) {
|
438
|
+
is_cuda = true;
|
439
|
+
}
|
440
|
+
else {
|
441
|
+
// fall back to pageable memory
|
442
|
+
addr = new uint8_t[size];
|
443
|
+
is_cuda = false;
|
444
|
+
}
|
445
|
+
this->size = size;
|
446
|
+
}
|
447
|
+
|
448
|
+
void free() {
|
449
|
+
if (addr) {
|
450
|
+
if (is_cuda) {
|
451
|
+
ggml_cuda_host_free(addr);
|
452
|
+
}
|
453
|
+
else {
|
454
|
+
delete[] addr;
|
455
|
+
}
|
456
|
+
}
|
457
|
+
addr = NULL;
|
458
|
+
}
|
459
|
+
|
460
|
+
~llama_ctx_buffer() {
|
461
|
+
free();
|
462
|
+
}
|
463
|
+
|
464
|
+
// disable copy and move
|
465
|
+
llama_ctx_buffer(const llama_ctx_buffer&) = delete;
|
466
|
+
llama_ctx_buffer(llama_ctx_buffer&&) = delete;
|
467
|
+
llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
|
468
|
+
llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
|
469
|
+
};
|
470
|
+
#else
|
471
|
+
typedef llama_buffer llama_ctx_buffer;
|
472
|
+
#endif
|
473
|
+
|
408
474
|
#endif
|