llama_cpp 0.10.0 → 0.10.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/llama_cpp.cpp +18 -1
- data/ext/llama_cpp/src/ggml-alloc.c +12 -4
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-backend-impl.h +12 -8
- data/ext/llama_cpp/src/ggml-backend.c +75 -5
- data/ext/llama_cpp/src/ggml-backend.h +7 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +952 -232
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +725 -98
- data/ext/llama_cpp/src/ggml-metal.metal +1508 -171
- data/ext/llama_cpp/src/ggml-quants.c +2 -2
- data/ext/llama_cpp/src/ggml.c +554 -215
- data/ext/llama_cpp/src/ggml.h +58 -23
- data/ext/llama_cpp/src/llama.cpp +1157 -851
- data/ext/llama_cpp/src/llama.h +9 -4
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -215,9 +215,9 @@
|
|
215
215
|
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
216
216
|
|
217
217
|
#define GGML_MAX_DIMS 4
|
218
|
-
#define GGML_MAX_PARAMS 1024
|
218
|
+
#define GGML_MAX_PARAMS 2048
|
219
219
|
#define GGML_MAX_CONTEXTS 64
|
220
|
-
#define GGML_MAX_SRC 6
|
220
|
+
#define GGML_MAX_SRC 10
|
221
221
|
#define GGML_MAX_NAME 64
|
222
222
|
#define GGML_MAX_OP_PARAMS 64
|
223
223
|
#define GGML_DEFAULT_N_THREADS 4
|
@@ -303,7 +303,7 @@ extern "C" {
|
|
303
303
|
|
304
304
|
#if defined(__ARM_NEON) && defined(__CUDACC__)
|
305
305
|
typedef half ggml_fp16_t;
|
306
|
-
#elif defined(__ARM_NEON)
|
306
|
+
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
|
307
307
|
typedef __fp16 ggml_fp16_t;
|
308
308
|
#else
|
309
309
|
typedef uint16_t ggml_fp16_t;
|
@@ -343,6 +343,12 @@ extern "C" {
|
|
343
343
|
GGML_TYPE_COUNT,
|
344
344
|
};
|
345
345
|
|
346
|
+
// precision
|
347
|
+
enum ggml_prec {
|
348
|
+
GGML_PREC_DEFAULT,
|
349
|
+
GGML_PREC_F32,
|
350
|
+
};
|
351
|
+
|
346
352
|
enum ggml_backend_type {
|
347
353
|
GGML_BACKEND_CPU = 0,
|
348
354
|
GGML_BACKEND_GPU = 10,
|
@@ -423,7 +429,9 @@ extern "C" {
|
|
423
429
|
GGML_OP_POOL_1D,
|
424
430
|
GGML_OP_POOL_2D,
|
425
431
|
GGML_OP_UPSCALE, // nearest interpolate
|
432
|
+
GGML_OP_PAD,
|
426
433
|
GGML_OP_ARGSORT,
|
434
|
+
GGML_OP_LEAKY_RELU,
|
427
435
|
|
428
436
|
GGML_OP_FLASH_ATTN,
|
429
437
|
GGML_OP_FLASH_FF,
|
@@ -463,7 +471,6 @@ extern "C" {
|
|
463
471
|
GGML_UNARY_OP_GELU,
|
464
472
|
GGML_UNARY_OP_GELU_QUICK,
|
465
473
|
GGML_UNARY_OP_SILU,
|
466
|
-
GGML_UNARY_OP_LEAKY,
|
467
474
|
|
468
475
|
GGML_UNARY_OP_COUNT,
|
469
476
|
};
|
@@ -477,7 +484,8 @@ extern "C" {
|
|
477
484
|
enum ggml_log_level {
|
478
485
|
GGML_LOG_LEVEL_ERROR = 2,
|
479
486
|
GGML_LOG_LEVEL_WARN = 3,
|
480
|
-
GGML_LOG_LEVEL_INFO = 4
|
487
|
+
GGML_LOG_LEVEL_INFO = 4,
|
488
|
+
GGML_LOG_LEVEL_DEBUG = 5
|
481
489
|
};
|
482
490
|
|
483
491
|
// ggml object
|
@@ -501,7 +509,6 @@ extern "C" {
|
|
501
509
|
|
502
510
|
struct ggml_backend_buffer * buffer;
|
503
511
|
|
504
|
-
int n_dims;
|
505
512
|
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
506
513
|
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
507
514
|
// nb[0] = ggml_type_size(type)
|
@@ -533,7 +540,7 @@ extern "C" {
|
|
533
540
|
|
534
541
|
void * extra; // extra things e.g. for ggml-cuda.cu
|
535
542
|
|
536
|
-
char padding[12];
|
543
|
+
char padding[8];
|
537
544
|
};
|
538
545
|
|
539
546
|
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
@@ -638,11 +645,14 @@ extern "C" {
|
|
638
645
|
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
639
646
|
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
640
647
|
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
641
|
-
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
|
642
648
|
|
643
|
-
GGML_API int ggml_blck_size (enum ggml_type type);
|
644
|
-
GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
645
|
-
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
649
|
+
GGML_API int ggml_blck_size(enum ggml_type type);
|
650
|
+
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
651
|
+
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
652
|
+
|
653
|
+
GGML_DEPRECATED(
|
654
|
+
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
655
|
+
"use ggml_row_size() instead");
|
646
656
|
|
647
657
|
GGML_API const char * ggml_type_name(enum ggml_type type);
|
648
658
|
GGML_API const char * ggml_op_name (enum ggml_op op);
|
@@ -661,6 +671,11 @@ extern "C" {
|
|
661
671
|
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
662
672
|
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
663
673
|
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
674
|
+
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
675
|
+
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
676
|
+
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
677
|
+
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
678
|
+
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
664
679
|
|
665
680
|
GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
666
681
|
|
@@ -721,8 +736,8 @@ extern "C" {
|
|
721
736
|
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
|
722
737
|
|
723
738
|
// Context tensor enumeration and lookup
|
724
|
-
GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
|
725
|
-
GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
|
739
|
+
GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
|
740
|
+
GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
|
726
741
|
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
727
742
|
|
728
743
|
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
@@ -793,6 +808,9 @@ extern "C" {
|
|
793
808
|
struct ggml_tensor * a,
|
794
809
|
struct ggml_tensor * b);
|
795
810
|
|
811
|
+
// dst = a
|
812
|
+
// view(dst, nb1, nb2, nb3, offset) += b
|
813
|
+
// return dst
|
796
814
|
GGML_API struct ggml_tensor * ggml_acc(
|
797
815
|
struct ggml_context * ctx,
|
798
816
|
struct ggml_tensor * a,
|
@@ -957,15 +975,14 @@ extern "C" {
|
|
957
975
|
struct ggml_context * ctx,
|
958
976
|
struct ggml_tensor * a);
|
959
977
|
|
960
|
-
GGML_API struct ggml_tensor * ggml_leaky(
|
978
|
+
GGML_API struct ggml_tensor * ggml_leaky_relu(
|
961
979
|
struct ggml_context * ctx,
|
962
|
-
struct ggml_tensor * a);
|
980
|
+
struct ggml_tensor * a, float negative_slope, bool inplace);
|
963
981
|
|
964
982
|
GGML_API struct ggml_tensor * ggml_relu_inplace(
|
965
983
|
struct ggml_context * ctx,
|
966
984
|
struct ggml_tensor * a);
|
967
985
|
|
968
|
-
// TODO: double-check this computation is correct
|
969
986
|
GGML_API struct ggml_tensor * ggml_gelu(
|
970
987
|
struct ggml_context * ctx,
|
971
988
|
struct ggml_tensor * a);
|
@@ -1047,11 +1064,18 @@ extern "C" {
|
|
1047
1064
|
struct ggml_tensor * a,
|
1048
1065
|
struct ggml_tensor * b);
|
1049
1066
|
|
1067
|
+
// change the precision of a matrix multiplication
|
1068
|
+
// set to GGML_PREC_F32 for higher precision (useful for phi-2)
|
1069
|
+
GGML_API void ggml_mul_mat_set_prec(
|
1070
|
+
struct ggml_tensor * a,
|
1071
|
+
enum ggml_prec prec);
|
1072
|
+
|
1050
1073
|
// indirect matrix multiplication
|
1051
1074
|
// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
|
1052
1075
|
GGML_API struct ggml_tensor * ggml_mul_mat_id(
|
1053
1076
|
struct ggml_context * ctx,
|
1054
|
-
struct ggml_tensor * as[],
|
1077
|
+
struct ggml_tensor * const as[],
|
1078
|
+
int n_as,
|
1055
1079
|
struct ggml_tensor * ids,
|
1056
1080
|
int id,
|
1057
1081
|
struct ggml_tensor * b);
|
@@ -1071,13 +1095,13 @@ extern "C" {
|
|
1071
1095
|
GGML_API struct ggml_tensor * ggml_scale(
|
1072
1096
|
struct ggml_context * ctx,
|
1073
1097
|
struct ggml_tensor * a,
|
1074
|
-
struct ggml_tensor * b);
|
1098
|
+
float s);
|
1075
1099
|
|
1076
1100
|
// in-place, returns view(a)
|
1077
1101
|
GGML_API struct ggml_tensor * ggml_scale_inplace(
|
1078
1102
|
struct ggml_context * ctx,
|
1079
1103
|
struct ggml_tensor * a,
|
1080
|
-
struct ggml_tensor * b);
|
1104
|
+
float s);
|
1081
1105
|
|
1082
1106
|
// b -> view(a,offset,nb1,nb2,3), return modified a
|
1083
1107
|
GGML_API struct ggml_tensor * ggml_set(
|
@@ -1263,6 +1287,7 @@ extern "C" {
|
|
1263
1287
|
struct ggml_context * ctx,
|
1264
1288
|
struct ggml_tensor * a);
|
1265
1289
|
|
1290
|
+
// supports 3D: a->ne[2] == b->ne[1]
|
1266
1291
|
GGML_API struct ggml_tensor * ggml_get_rows(
|
1267
1292
|
struct ggml_context * ctx,
|
1268
1293
|
struct ggml_tensor * a,
|
@@ -1549,6 +1574,15 @@ extern "C" {
|
|
1549
1574
|
struct ggml_tensor * a,
|
1550
1575
|
int scale_factor);
|
1551
1576
|
|
1577
|
+
// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
|
1578
|
+
GGML_API struct ggml_tensor * ggml_pad(
|
1579
|
+
struct ggml_context * ctx,
|
1580
|
+
struct ggml_tensor * a,
|
1581
|
+
int p0,
|
1582
|
+
int p1,
|
1583
|
+
int p2,
|
1584
|
+
int p3);
|
1585
|
+
|
1552
1586
|
// sort rows
|
1553
1587
|
enum ggml_sort_order {
|
1554
1588
|
GGML_SORT_ASC,
|
@@ -2102,10 +2136,11 @@ extern "C" {
|
|
2102
2136
|
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
|
2103
2137
|
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|
2104
2138
|
|
2105
|
-
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|
2106
|
-
GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
|
2107
|
-
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
|
2108
|
-
GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
|
2139
|
+
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|
2140
|
+
GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
|
2141
|
+
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
|
2142
|
+
GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
|
2143
|
+
GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
|
2109
2144
|
|
2110
2145
|
// overrides existing values or adds a new one
|
2111
2146
|
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
|