llama_cpp 0.0.7 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +736 -36
- data/ext/llama_cpp/src/ggml-cuda.h +8 -33
- data/ext/llama_cpp/src/ggml-opencl.c +202 -20
- data/ext/llama_cpp/src/ggml.c +732 -496
- data/ext/llama_cpp/src/ggml.h +47 -5
- data/ext/llama_cpp/src/{llama_util.h → llama-util.h} +76 -10
- data/ext/llama_cpp/src/llama.cpp +560 -147
- data/ext/llama_cpp/src/llama.h +71 -24
- data/lib/llama_cpp/client.rb +29 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +27 -3
- data/sig/llama_cpp.rbs +38 -3
- metadata +3 -3
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -197,6 +197,14 @@
 #define GGML_MAX_OPT 4
 #define GGML_DEFAULT_N_THREADS 4
 
+#define GGML_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            abort(); \
+        } \
+    } while (0)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
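The new GGML_ASSERT macro wraps the usual fprintf/abort pattern in a `do { ... } while (0)` so it behaves as a single statement (e.g. after an unbraced `if`). A minimal sketch of its behavior; the macro body is copied from the hunk above, so this compiles stand-alone without ggml.h:

```cpp
#include <cstdio>
#include <cstdlib>

#define GGML_ASSERT(x) \
    do { \
        if (!(x)) { \
            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
            abort(); \
        } \
    } while (0)

int main() {
    int n_threads = 4;
    GGML_ASSERT(n_threads > 0);      // passes silently
    GGML_ASSERT(n_threads % 3 == 0); // prints "GGML_ASSERT: <file>:<line>: n_threads % 3 == 0" and aborts
    return 0;
}
```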
@@ -212,6 +220,9 @@ extern "C" {
|
|
212
220
|
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
213
221
|
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
|
214
222
|
|
223
|
+
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
|
224
|
+
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
|
225
|
+
|
215
226
|
struct ggml_object;
|
216
227
|
struct ggml_context;
|
217
228
|
|
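The new `_row` helpers convert a whole buffer in one call instead of looping over the scalar `ggml_fp16_to_fp32`/`ggml_fp32_to_fp16`. A minimal round-trip sketch, assuming this version's ggml.h is on the include path:

```cpp
#include <cstdio>
#include "ggml.h"

int main() {
    const size_t n = 4;
    float src[n] = {0.0f, 1.0f, -2.5f, 65504.0f}; // 65504 is the fp16 max
    ggml_fp16_t half[n];
    float back[n];

    ggml_fp32_to_fp16_row(src, half, n);  // fp32 -> fp16, one call for the whole row
    ggml_fp16_to_fp32_row(half, back, n); // fp16 -> fp32

    for (size_t i = 0; i < n; ++i) {
        printf("%f -> %f\n", src[i], back[i]); // exactly representable values round-trip unchanged
    }
    return 0;
}
```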
@@ -221,7 +232,7 @@ extern "C" {
         GGML_TYPE_Q4_0 = 2,
         GGML_TYPE_Q4_1 = 3,
         GGML_TYPE_Q4_2 = 4,
-        GGML_TYPE_Q4_3 = 5,
+        // GGML_TYPE_Q4_3 (5) support has been removed
         GGML_TYPE_Q5_0 = 6,
         GGML_TYPE_Q5_1 = 7,
         GGML_TYPE_Q8_0 = 8,
@@ -232,6 +243,20 @@ extern "C" {
         GGML_TYPE_COUNT,
     };
 
+    // model file types
+    enum ggml_ftype {
+        GGML_FTYPE_UNKNOWN     = -1,
+        GGML_FTYPE_ALL_F32     = 0,
+        GGML_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        GGML_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+    };
+
     // available tensor operations:
     enum ggml_op {
         GGML_OP_NONE = 0,
@@ -269,6 +294,7 @@ extern "C" {
         GGML_OP_DIAG_MASK_INF,
         GGML_OP_SOFT_MAX,
         GGML_OP_ROPE,
+        GGML_OP_ALIBI,
         GGML_OP_CONV_1D_1S,
         GGML_OP_CONV_1D_2S,
 
@@ -324,7 +350,10 @@ extern "C" {
         int64_t perf_time_us;
 
         void * data;
-        char padding[8];
+
+        char name[32];
+
+        char padding[8]; // TODO: remove and add padding to name?
     };
 
     // computation graph
@@ -384,6 +413,9 @@ extern "C" {
 
     GGML_API bool ggml_is_quantized(enum ggml_type type);
 
+    // TODO: temporary until model loading of ggml examples is refactored
+    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
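Together with the `ggml_ftype` enum added above, this helper lets a loader derive the tensor type used for most 2d weights from a model file's ftype field. A hedged sketch of the intended use:

```cpp
#include <cstdio>
#include "ggml.h"

int main() {
    enum ggml_ftype ftype = GGML_FTYPE_MOSTLY_Q5_0;
    enum ggml_type  wtype = ggml_ftype_to_ggml_type(ftype);

    // 1d tensors (e.g. norm weights) stay f32 regardless of ftype,
    // which is what the "except 1d tensors" comments refer to.
    printf("weight type id: %d\n", (int) wtype); // expected: GGML_TYPE_Q5_0
    return 0;
}
```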
@@ -444,6 +476,9 @@ extern "C" {
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
+    GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
+    GGML_API void         ggml_set_name(struct ggml_tensor * tensor, const char * name);
+
     //
     // operations on tensors with backpropagation
     //
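These accessors pair with the new `name[32]` field added to `ggml_tensor` above; longer names are truncated to fit the fixed field. A minimal sketch, with the tensor shape and name chosen only for illustration:

```cpp
#include <cstdio>
#include "ggml.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 8);
    ggml_set_name(w, "layers.0.attention.wq"); // stored in the 32-byte name field
    printf("%s\n", ggml_get_name(w));

    ggml_free(ctx);
    return 0;
}
```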
@@ -662,6 +697,14 @@ extern "C" {
             int n_dims,
             int mode);
 
+    // alibi position embedding
+    // in-place, returns view(a)
+    struct ggml_tensor * ggml_alibi(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_head);
+
     // padding = 1
     // TODO: we don't support extra parameters for now
     // that's why we are hard-coding the stride, padding, and dilation
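`ggml_alibi` adds ALiBi's linear position biases to an attention score matrix in place, one slope per head, as an alternative to rotary embeddings. A hedged sketch of where it would sit in a graph; the helper name and shapes are ours, not from the diff:

```cpp
#include "ggml.h"

// hypothetical helper: bias the KQ scores with ALiBi, then softmax
static struct ggml_tensor * alibi_softmax(
        struct ggml_context * ctx,
        struct ggml_tensor  * kq,   // raw attention scores for n_head heads
        int                   n_past,
        int                   n_head) {
    // in-place: the result is a view of kq with per-head linear biases added
    struct ggml_tensor * kq_biased = ggml_alibi(ctx, kq, n_past, n_head);
    return ggml_soft_max(ctx, kq_biased);
}
```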
@@ -692,8 +735,8 @@ extern "C" {
             struct ggml_tensor * c1);
 
     // Mapping operations
-    GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
-    GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+    typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
 
     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
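With these typedefs, a custom element-wise op can be routed through the graph without touching ggml internals. A minimal sketch; the callback receives (element count, destination row, source row) per the typedef above, and the op itself is just an example:

```cpp
#include <cmath>
#include "ggml.h"

// example element-wise function: a tanh-based GELU approximation
static void my_gelu_like(const int n, float * dst, const float * src) {
    for (int i = 0; i < n; ++i) {
        dst[i] = 0.5f * src[i] * (1.0f + tanhf(0.79788456f * src[i]));
    }
}

struct ggml_tensor * build_custom_op(struct ggml_context * ctx, struct ggml_tensor * x) {
    // the function pointer is recorded in the graph and called during compute
    return ggml_map_unary_f32(ctx, x, my_gelu_like);
}
```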
@@ -834,7 +877,6 @@ extern "C" {
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
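Callers of the removed `ggml_quantize_q4_3` must migrate to one of the remaining quantizers. A hedged sketch of the common calling convention, assuming `n` is the total element count, `k` the row length, and `hist` a 16-bucket histogram the quantizer fills in; the wrapper function is ours:

```cpp
#include <cstdint>
#include <vector>
#include "ggml.h"

size_t quantize_rows_q5_0(const std::vector<float> & src, std::vector<uint8_t> & dst, int row_len) {
    int64_t hist[16] = {0};
    // dst must hold the quantized blocks; the fp32 size is a safe
    // upper bound for a sketch like this (quantized data is smaller)
    dst.resize(src.size() * sizeof(float));
    return ggml_quantize_q5_0(src.data(), dst.data(), (int) src.size(), row_len, hist);
}
```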
data/ext/llama_cpp/src/llama_util.h → llama-util.h
RENAMED
@@ -14,6 +14,7 @@
 
 #include <string>
 #include <vector>
+#include <stdexcept>
 
 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -74,7 +75,7 @@ struct llama_file {
     llama_file(const char * fname, const char * mode) {
         fp = std::fopen(fname, mode);
         if (fp == NULL) {
-            throw format("failed to open %s: %s", fname, strerror(errno));
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
         }
         seek(0, SEEK_END);
         size = tell();
@@ -107,10 +108,10 @@ struct llama_file {
         errno = 0;
         std::size_t ret = std::fread(ptr, size, 1, fp);
         if (ferror(fp)) {
-            throw format("read error: %s", strerror(errno));
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
         if (ret != 1) {
-            throw std::string("unexpectedly reached end of file");
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
         }
     }
 
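Replacing `throw format(...)` and `throw std::string(...)` with `std::runtime_error` (hence the new `<stdexcept>` include) means every failure from this header is now an `std::exception`, so callers get one uniform catch site. A minimal sketch, with the file name invented for the example:

```cpp
#include <cstdio>
#include <exception>
#include "llama-util.h"

int main() {
    try {
        llama_file f("missing-model.bin", "rb"); // constructor throws on open failure
        // ... read header, tensors, etc.
    } catch (const std::exception & e) {
        // e.g. "failed to open missing-model.bin: No such file or directory"
        fprintf(stderr, "error: %s\n", e.what());
        return 1;
    }
    return 0;
}
```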
@@ -133,7 +134,7 @@ struct llama_file {
         errno = 0;
         size_t ret = std::fwrite(ptr, size, 1, fp);
         if (ret != 1) {
-            throw format("write error: %s", strerror(errno));
+            throw std::runtime_error(format("write error: %s", strerror(errno)));
         }
     }
 
@@ -180,7 +181,7 @@ struct llama_mmap {
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
-            throw format("mmap failed: %s", strerror(errno));
+            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
 
         if (prefetch) {
@@ -207,7 +208,7 @@ struct llama_mmap {
         DWORD error = GetLastError();
 
         if (hMapping == NULL) {
-            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
         }
 
         addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
@@ -215,7 +216,7 @@ struct llama_mmap {
         CloseHandle(hMapping);
 
         if (addr == NULL) {
-            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
+            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
         }
 
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
@@ -243,8 +244,9 @@ struct llama_mmap {
 #else
     static constexpr bool SUPPORTED = false;
 
-    llama_mmap(struct llama_file *) {
-        throw std::string("mmap not supported");
+    llama_mmap(struct llama_file *, bool prefetch = true) {
+        (void)prefetch;
+        throw std::runtime_error(std::string("mmap not supported"));
     }
 #endif
 };
@@ -382,8 +384,13 @@ struct llama_mlock {
 #else
     static constexpr bool SUPPORTED = false;
 
-    void raw_lock(const void * addr, size_t size) {
+    size_t lock_granularity() {
+        return (size_t) 65536;
+    }
+
+    bool raw_lock(const void * addr, size_t size) {
         fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
     }
 
     void raw_unlock(const void * addr, size_t size) {}
@@ -395,6 +402,8 @@ struct llama_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
 
+    llama_buffer() = default;
+
     void resize(size_t size) {
         delete[] addr;
         addr = new uint8_t[size];
@@ -404,5 +413,62 @@ struct llama_buffer {
     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };
+
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+struct llama_ctx_buffer {
+    uint8_t * addr = NULL;
+    bool is_cuda;
+    size_t size = 0;
+
+    llama_ctx_buffer() = default;
+
+    void resize(size_t size) {
+        free();
+
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        if (addr) {
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
+        }
+        this->size = size;
+    }
+
+    void free() {
+        if (addr) {
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
+        }
+        addr = NULL;
+    }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
+};
+#else
+typedef llama_buffer llama_ctx_buffer;
+#endif
+
 #endif
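Both buffer types free `addr` in their destructors, which is why the copy and move operations above are explicitly deleted: an implicit copy would leave two owners of the same allocation and end in a double free. A short usage illustration (the commented-out lines fail to compile by design):

```cpp
#include "llama-util.h"

void buffer_usage_example() {
    llama_ctx_buffer buf; // pinned CUDA host memory when available, heap otherwise
    buf.resize(1024);

    // llama_ctx_buffer copy  = buf;            // does not compile: copy ctor is deleted
    // llama_ctx_buffer moved = std::move(buf); // does not compile: move ctor is deleted too

    llama_ctx_buffer & ref = buf; // share by reference instead
    (void) ref;
} // ~llama_ctx_buffer frees the memory exactly once
```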