llama_cpp 0.10.0 → 0.10.2
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/llama_cpp.cpp +18 -1
- data/ext/llama_cpp/src/ggml-alloc.c +12 -4
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-backend-impl.h +12 -8
- data/ext/llama_cpp/src/ggml-backend.c +75 -5
- data/ext/llama_cpp/src/ggml-backend.h +7 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +952 -232
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +725 -98
- data/ext/llama_cpp/src/ggml-metal.metal +1508 -171
- data/ext/llama_cpp/src/ggml-quants.c +2 -2
- data/ext/llama_cpp/src/ggml.c +554 -215
- data/ext/llama_cpp/src/ggml.h +58 -23
- data/ext/llama_cpp/src/llama.cpp +1157 -851
- data/ext/llama_cpp/src/llama.h +9 -4
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -215,9 +215,9 @@
|
|
215
215
|
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
216
216
|
|
217
217
|
#define GGML_MAX_DIMS 4
|
218
|
-
#define GGML_MAX_PARAMS
|
218
|
+
#define GGML_MAX_PARAMS 2048
|
219
219
|
#define GGML_MAX_CONTEXTS 64
|
220
|
-
#define GGML_MAX_SRC
|
220
|
+
#define GGML_MAX_SRC 10
|
221
221
|
#define GGML_MAX_NAME 64
|
222
222
|
#define GGML_MAX_OP_PARAMS 64
|
223
223
|
#define GGML_DEFAULT_N_THREADS 4
|
@@ -303,7 +303,7 @@ extern "C" {
|
|
303
303
|
|
304
304
|
#if defined(__ARM_NEON) && defined(__CUDACC__)
|
305
305
|
typedef half ggml_fp16_t;
|
306
|
-
#elif defined(__ARM_NEON)
|
306
|
+
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
|
307
307
|
typedef __fp16 ggml_fp16_t;
|
308
308
|
#else
|
309
309
|
typedef uint16_t ggml_fp16_t;
|
@@ -343,6 +343,12 @@ extern "C" {
|
|
343
343
|
GGML_TYPE_COUNT,
|
344
344
|
};
|
345
345
|
|
346
|
+
// precision
|
347
|
+
enum ggml_prec {
|
348
|
+
GGML_PREC_DEFAULT,
|
349
|
+
GGML_PREC_F32,
|
350
|
+
};
|
351
|
+
|
346
352
|
enum ggml_backend_type {
|
347
353
|
GGML_BACKEND_CPU = 0,
|
348
354
|
GGML_BACKEND_GPU = 10,
|
@@ -423,7 +429,9 @@ extern "C" {
|
|
423
429
|
GGML_OP_POOL_1D,
|
424
430
|
GGML_OP_POOL_2D,
|
425
431
|
GGML_OP_UPSCALE, // nearest interpolate
|
432
|
+
GGML_OP_PAD,
|
426
433
|
GGML_OP_ARGSORT,
|
434
|
+
GGML_OP_LEAKY_RELU,
|
427
435
|
|
428
436
|
GGML_OP_FLASH_ATTN,
|
429
437
|
GGML_OP_FLASH_FF,
|
@@ -463,7 +471,6 @@ extern "C" {
|
|
463
471
|
GGML_UNARY_OP_GELU,
|
464
472
|
GGML_UNARY_OP_GELU_QUICK,
|
465
473
|
GGML_UNARY_OP_SILU,
|
466
|
-
GGML_UNARY_OP_LEAKY,
|
467
474
|
|
468
475
|
GGML_UNARY_OP_COUNT,
|
469
476
|
};
|
@@ -477,7 +484,8 @@ extern "C" {
|
|
477
484
|
enum ggml_log_level {
|
478
485
|
GGML_LOG_LEVEL_ERROR = 2,
|
479
486
|
GGML_LOG_LEVEL_WARN = 3,
|
480
|
-
GGML_LOG_LEVEL_INFO = 4
|
487
|
+
GGML_LOG_LEVEL_INFO = 4,
|
488
|
+
GGML_LOG_LEVEL_DEBUG = 5
|
481
489
|
};
|
482
490
|
|
483
491
|
// ggml object
|
@@ -501,7 +509,6 @@ extern "C" {
|
|
501
509
|
|
502
510
|
struct ggml_backend_buffer * buffer;
|
503
511
|
|
504
|
-
int n_dims;
|
505
512
|
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
506
513
|
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
507
514
|
// nb[0] = ggml_type_size(type)
|
@@ -533,7 +540,7 @@ extern "C" {
|
|
533
540
|
|
534
541
|
void * extra; // extra things e.g. for ggml-cuda.cu
|
535
542
|
|
536
|
-
char padding[
|
543
|
+
char padding[8];
|
537
544
|
};
|
538
545
|
|
539
546
|
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
@@ -638,11 +645,14 @@ extern "C" {
|
|
638
645
|
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
639
646
|
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
640
647
|
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
641
|
-
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
|
642
648
|
|
643
|
-
GGML_API int
|
644
|
-
GGML_API size_t
|
645
|
-
GGML_API
|
649
|
+
GGML_API int ggml_blck_size(enum ggml_type type);
|
650
|
+
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
651
|
+
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
652
|
+
|
653
|
+
GGML_DEPRECATED(
|
654
|
+
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
655
|
+
"use ggml_row_size() instead");
|
646
656
|
|
647
657
|
GGML_API const char * ggml_type_name(enum ggml_type type);
|
648
658
|
GGML_API const char * ggml_op_name (enum ggml_op op);
|
@@ -661,6 +671,11 @@ extern "C" {
|
|
661
671
|
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
662
672
|
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
663
673
|
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
674
|
+
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
675
|
+
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
676
|
+
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
677
|
+
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
678
|
+
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
664
679
|
|
665
680
|
GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
666
681
|
|
@@ -721,8 +736,8 @@ extern "C" {
|
|
721
736
|
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
|
722
737
|
|
723
738
|
// Context tensor enumeration and lookup
|
724
|
-
GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
|
725
|
-
GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
|
739
|
+
GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
|
740
|
+
GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
|
726
741
|
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
727
742
|
|
728
743
|
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
@@ -793,6 +808,9 @@ extern "C" {
|
|
793
808
|
struct ggml_tensor * a,
|
794
809
|
struct ggml_tensor * b);
|
795
810
|
|
811
|
+
// dst = a
|
812
|
+
// view(dst, nb1, nb2, nb3, offset) += b
|
813
|
+
// return dst
|
796
814
|
GGML_API struct ggml_tensor * ggml_acc(
|
797
815
|
struct ggml_context * ctx,
|
798
816
|
struct ggml_tensor * a,
|
@@ -957,15 +975,14 @@ extern "C" {
|
|
957
975
|
struct ggml_context * ctx,
|
958
976
|
struct ggml_tensor * a);
|
959
977
|
|
960
|
-
GGML_API struct ggml_tensor *
|
978
|
+
GGML_API struct ggml_tensor * ggml_leaky_relu(
|
961
979
|
struct ggml_context * ctx,
|
962
|
-
struct ggml_tensor * a);
|
980
|
+
struct ggml_tensor * a, float negative_slope, bool inplace);
|
963
981
|
|
964
982
|
GGML_API struct ggml_tensor * ggml_relu_inplace(
|
965
983
|
struct ggml_context * ctx,
|
966
984
|
struct ggml_tensor * a);
|
967
985
|
|
968
|
-
// TODO: double-check this computation is correct
|
969
986
|
GGML_API struct ggml_tensor * ggml_gelu(
|
970
987
|
struct ggml_context * ctx,
|
971
988
|
struct ggml_tensor * a);
|
@@ -1047,11 +1064,18 @@ extern "C" {
|
|
1047
1064
|
struct ggml_tensor * a,
|
1048
1065
|
struct ggml_tensor * b);
|
1049
1066
|
|
1067
|
+
// change the precision of a matrix multiplication
|
1068
|
+
// set to GGML_PREC_F32 for higher precision (useful for phi-2)
|
1069
|
+
GGML_API void ggml_mul_mat_set_prec(
|
1070
|
+
struct ggml_tensor * a,
|
1071
|
+
enum ggml_prec prec);
|
1072
|
+
|
1050
1073
|
// indirect matrix multiplication
|
1051
1074
|
// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
|
1052
1075
|
GGML_API struct ggml_tensor * ggml_mul_mat_id(
|
1053
1076
|
struct ggml_context * ctx,
|
1054
|
-
struct ggml_tensor * as[],
|
1077
|
+
struct ggml_tensor * const as[],
|
1078
|
+
int n_as,
|
1055
1079
|
struct ggml_tensor * ids,
|
1056
1080
|
int id,
|
1057
1081
|
struct ggml_tensor * b);
|
@@ -1071,13 +1095,13 @@ extern "C" {
|
|
1071
1095
|
GGML_API struct ggml_tensor * ggml_scale(
|
1072
1096
|
struct ggml_context * ctx,
|
1073
1097
|
struct ggml_tensor * a,
|
1074
|
-
|
1098
|
+
float s);
|
1075
1099
|
|
1076
1100
|
// in-place, returns view(a)
|
1077
1101
|
GGML_API struct ggml_tensor * ggml_scale_inplace(
|
1078
1102
|
struct ggml_context * ctx,
|
1079
1103
|
struct ggml_tensor * a,
|
1080
|
-
|
1104
|
+
float s);
|
1081
1105
|
|
1082
1106
|
// b -> view(a,offset,nb1,nb2,3), return modified a
|
1083
1107
|
GGML_API struct ggml_tensor * ggml_set(
|
@@ -1263,6 +1287,7 @@ extern "C" {
|
|
1263
1287
|
struct ggml_context * ctx,
|
1264
1288
|
struct ggml_tensor * a);
|
1265
1289
|
|
1290
|
+
// supports 3D: a->ne[2] == b->ne[1]
|
1266
1291
|
GGML_API struct ggml_tensor * ggml_get_rows(
|
1267
1292
|
struct ggml_context * ctx,
|
1268
1293
|
struct ggml_tensor * a,
|
@@ -1549,6 +1574,15 @@ extern "C" {
|
|
1549
1574
|
struct ggml_tensor * a,
|
1550
1575
|
int scale_factor);
|
1551
1576
|
|
1577
|
+
// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
|
1578
|
+
GGML_API struct ggml_tensor * ggml_pad(
|
1579
|
+
struct ggml_context * ctx,
|
1580
|
+
struct ggml_tensor * a,
|
1581
|
+
int p0,
|
1582
|
+
int p1,
|
1583
|
+
int p2,
|
1584
|
+
int p3);
|
1585
|
+
|
1552
1586
|
// sort rows
|
1553
1587
|
enum ggml_sort_order {
|
1554
1588
|
GGML_SORT_ASC,
|
@@ -2102,10 +2136,11 @@ extern "C" {
|
|
2102
2136
|
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
|
2103
2137
|
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|
2104
2138
|
|
2105
|
-
GGML_API int
|
2106
|
-
GGML_API int
|
2107
|
-
GGML_API size_t
|
2108
|
-
GGML_API char *
|
2139
|
+
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|
2140
|
+
GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
|
2141
|
+
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
|
2142
|
+
GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
|
2143
|
+
GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
|
2109
2144
|
|
2110
2145
|
// overrides existing values or adds a new one
|
2111
2146
|
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
|