llama_cpp 0.9.5 → 0.10.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/llama_cpp.cpp +123 -15
- data/ext/llama_cpp/src/ggml-alloc.c +42 -7
- data/ext/llama_cpp/src/ggml-alloc.h +8 -1
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1796 -413
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +998 -169
- data/ext/llama_cpp/src/ggml-metal.metal +2253 -274
- data/ext/llama_cpp/src/ggml-quants.c +2 -2
- data/ext/llama_cpp/src/ggml.c +634 -248
- data/ext/llama_cpp/src/ggml.h +81 -15
- data/ext/llama_cpp/src/llama.cpp +932 -352
- data/ext/llama_cpp/src/llama.h +28 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +22 -2
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -215,9 +215,9 @@
|
|
215
215
|
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
216
216
|
|
217
217
|
#define GGML_MAX_DIMS 4
|
218
|
-
#define GGML_MAX_PARAMS 1024
|
218
|
+
#define GGML_MAX_PARAMS 2048
|
219
219
|
#define GGML_MAX_CONTEXTS 64
|
220
|
-
#define GGML_MAX_SRC 6
|
220
|
+
#define GGML_MAX_SRC 10
|
221
221
|
#define GGML_MAX_NAME 64
|
222
222
|
#define GGML_MAX_OP_PARAMS 64
|
223
223
|
#define GGML_DEFAULT_N_THREADS 4
|
@@ -283,6 +283,20 @@
|
|
283
283
|
const type prefix##3 = (pointer)->array[3]; \
|
284
284
|
GGML_UNUSED(prefix##3);
|
285
285
|
|
286
|
+
#define GGML_TENSOR_UNARY_OP_LOCALS \
|
287
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
|
288
|
+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
|
289
|
+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
290
|
+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
291
|
+
|
292
|
+
#define GGML_TENSOR_BINARY_OP_LOCALS \
|
293
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
|
294
|
+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
|
295
|
+
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
|
296
|
+
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
|
297
|
+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
298
|
+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
299
|
+
|
286
300
|
#ifdef __cplusplus
|
287
301
|
extern "C" {
|
288
302
|
#endif
|
@@ -381,6 +395,7 @@ extern "C" {
|
|
381
395
|
GGML_OP_GROUP_NORM,
|
382
396
|
|
383
397
|
GGML_OP_MUL_MAT,
|
398
|
+
GGML_OP_MUL_MAT_ID,
|
384
399
|
GGML_OP_OUT_PROD,
|
385
400
|
|
386
401
|
GGML_OP_SCALE,
|
@@ -407,8 +422,10 @@ extern "C" {
|
|
407
422
|
GGML_OP_CONV_TRANSPOSE_2D,
|
408
423
|
GGML_OP_POOL_1D,
|
409
424
|
GGML_OP_POOL_2D,
|
410
|
-
|
411
425
|
GGML_OP_UPSCALE, // nearest interpolate
|
426
|
+
GGML_OP_PAD,
|
427
|
+
GGML_OP_ARGSORT,
|
428
|
+
GGML_OP_LEAKY_RELU,
|
412
429
|
|
413
430
|
GGML_OP_FLASH_ATTN,
|
414
431
|
GGML_OP_FLASH_FF,
|
@@ -448,7 +465,8 @@ extern "C" {
|
|
448
465
|
GGML_UNARY_OP_GELU,
|
449
466
|
GGML_UNARY_OP_GELU_QUICK,
|
450
467
|
GGML_UNARY_OP_SILU,
|
451
|
-
|
468
|
+
|
469
|
+
GGML_UNARY_OP_COUNT,
|
452
470
|
};
|
453
471
|
|
454
472
|
enum ggml_object_type {
|
@@ -484,7 +502,6 @@ extern "C" {
|
|
484
502
|
|
485
503
|
struct ggml_backend_buffer * buffer;
|
486
504
|
|
487
|
-
int n_dims;
|
488
505
|
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
489
506
|
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
490
507
|
// nb[0] = ggml_type_size(type)
|
@@ -516,7 +533,7 @@ extern "C" {
|
|
516
533
|
|
517
534
|
void * extra; // extra things e.g. for ggml-cuda.cu
|
518
535
|
|
519
|
-
char padding[12];
|
536
|
+
char padding[8];
|
520
537
|
};
|
521
538
|
|
522
539
|
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
@@ -621,16 +638,22 @@ extern "C" {
|
|
621
638
|
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
622
639
|
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
623
640
|
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
624
|
-
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
|
625
641
|
|
626
|
-
GGML_API int ggml_blck_size (enum ggml_type type);
|
627
|
-
GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
628
|
-
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
642
|
+
GGML_API int ggml_blck_size(enum ggml_type type);
|
643
|
+
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
644
|
+
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
645
|
+
|
646
|
+
GGML_DEPRECATED(
|
647
|
+
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
648
|
+
"use ggml_row_size() instead");
|
629
649
|
|
630
650
|
GGML_API const char * ggml_type_name(enum ggml_type type);
|
631
651
|
GGML_API const char * ggml_op_name (enum ggml_op op);
|
632
652
|
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
633
653
|
|
654
|
+
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
|
655
|
+
GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
|
656
|
+
|
634
657
|
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
635
658
|
|
636
659
|
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
@@ -641,6 +664,11 @@ extern "C" {
|
|
641
664
|
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
642
665
|
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
643
666
|
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
667
|
+
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
668
|
+
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
669
|
+
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
670
|
+
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
671
|
+
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
644
672
|
|
645
673
|
GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
646
674
|
|
@@ -773,6 +801,9 @@ extern "C" {
|
|
773
801
|
struct ggml_tensor * a,
|
774
802
|
struct ggml_tensor * b);
|
775
803
|
|
804
|
+
// dst = a
|
805
|
+
// view(dst, nb1, nb2, nb3, offset) += b
|
806
|
+
// return dst
|
776
807
|
GGML_API struct ggml_tensor * ggml_acc(
|
777
808
|
struct ggml_context * ctx,
|
778
809
|
struct ggml_tensor * a,
|
@@ -937,15 +968,14 @@ extern "C" {
|
|
937
968
|
struct ggml_context * ctx,
|
938
969
|
struct ggml_tensor * a);
|
939
970
|
|
940
|
-
GGML_API struct ggml_tensor * ggml_leaky(
|
971
|
+
GGML_API struct ggml_tensor * ggml_leaky_relu(
|
941
972
|
struct ggml_context * ctx,
|
942
|
-
struct ggml_tensor * a);
|
973
|
+
struct ggml_tensor * a, float negative_slope, bool inplace);
|
943
974
|
|
944
975
|
GGML_API struct ggml_tensor * ggml_relu_inplace(
|
945
976
|
struct ggml_context * ctx,
|
946
977
|
struct ggml_tensor * a);
|
947
978
|
|
948
|
-
// TODO: double-check this computation is correct
|
949
979
|
GGML_API struct ggml_tensor * ggml_gelu(
|
950
980
|
struct ggml_context * ctx,
|
951
981
|
struct ggml_tensor * a);
|
@@ -1027,6 +1057,16 @@ extern "C" {
|
|
1027
1057
|
struct ggml_tensor * a,
|
1028
1058
|
struct ggml_tensor * b);
|
1029
1059
|
|
1060
|
+
// indirect matrix multiplication
|
1061
|
+
// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
|
1062
|
+
GGML_API struct ggml_tensor * ggml_mul_mat_id(
|
1063
|
+
struct ggml_context * ctx,
|
1064
|
+
struct ggml_tensor * const as[],
|
1065
|
+
int n_as,
|
1066
|
+
struct ggml_tensor * ids,
|
1067
|
+
int id,
|
1068
|
+
struct ggml_tensor * b);
|
1069
|
+
|
1030
1070
|
// A: m columns, n rows,
|
1031
1071
|
// B: p columns, n rows,
|
1032
1072
|
// result is m columns, p rows
|
@@ -1234,6 +1274,7 @@ extern "C" {
|
|
1234
1274
|
struct ggml_context * ctx,
|
1235
1275
|
struct ggml_tensor * a);
|
1236
1276
|
|
1277
|
+
// supports 3D: a->ne[2] == b->ne[1]
|
1237
1278
|
GGML_API struct ggml_tensor * ggml_get_rows(
|
1238
1279
|
struct ggml_context * ctx,
|
1239
1280
|
struct ggml_tensor * a,
|
@@ -1520,6 +1561,32 @@ extern "C" {
|
|
1520
1561
|
struct ggml_tensor * a,
|
1521
1562
|
int scale_factor);
|
1522
1563
|
|
1564
|
+
// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
|
1565
|
+
GGML_API struct ggml_tensor * ggml_pad(
|
1566
|
+
struct ggml_context * ctx,
|
1567
|
+
struct ggml_tensor * a,
|
1568
|
+
int p0,
|
1569
|
+
int p1,
|
1570
|
+
int p2,
|
1571
|
+
int p3);
|
1572
|
+
|
1573
|
+
// sort rows
|
1574
|
+
enum ggml_sort_order {
|
1575
|
+
GGML_SORT_ASC,
|
1576
|
+
GGML_SORT_DESC,
|
1577
|
+
};
|
1578
|
+
|
1579
|
+
GGML_API struct ggml_tensor * ggml_argsort(
|
1580
|
+
struct ggml_context * ctx,
|
1581
|
+
struct ggml_tensor * a,
|
1582
|
+
enum ggml_sort_order order);
|
1583
|
+
|
1584
|
+
// top k elements per row
|
1585
|
+
GGML_API struct ggml_tensor * ggml_top_k(
|
1586
|
+
struct ggml_context * ctx,
|
1587
|
+
struct ggml_tensor * a,
|
1588
|
+
int k);
|
1589
|
+
|
1523
1590
|
GGML_API struct ggml_tensor * ggml_flash_attn(
|
1524
1591
|
struct ggml_context * ctx,
|
1525
1592
|
struct ggml_tensor * q,
|
@@ -1581,7 +1648,6 @@ extern "C" {
|
|
1581
1648
|
int kh);
|
1582
1649
|
|
1583
1650
|
// used in sam
|
1584
|
-
|
1585
1651
|
GGML_API struct ggml_tensor * ggml_add_rel_pos(
|
1586
1652
|
struct ggml_context * ctx,
|
1587
1653
|
struct ggml_tensor * a,
|
@@ -1756,7 +1822,7 @@ extern "C" {
|
|
1756
1822
|
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
1757
1823
|
GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
|
1758
1824
|
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
1759
|
-
GGML_API struct ggml_cgraph * ggml_graph_view (struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i0, int i1);
|
1825
|
+
GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
|
1760
1826
|
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
1761
1827
|
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
|
1762
1828
|
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|