llama_cpp 0.9.5 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/llama_cpp.cpp +123 -15
- data/ext/llama_cpp/src/ggml-alloc.c +42 -7
- data/ext/llama_cpp/src/ggml-alloc.h +8 -1
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1796 -413
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +998 -169
- data/ext/llama_cpp/src/ggml-metal.metal +2253 -274
- data/ext/llama_cpp/src/ggml-quants.c +2 -2
- data/ext/llama_cpp/src/ggml.c +634 -248
- data/ext/llama_cpp/src/ggml.h +81 -15
- data/ext/llama_cpp/src/llama.cpp +932 -352
- data/ext/llama_cpp/src/llama.h +28 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +22 -2
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -215,9 +215,9 @@
|
|
215
215
|
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
216
216
|
|
217
217
|
#define GGML_MAX_DIMS 4
|
218
|
-
#define GGML_MAX_PARAMS 1024
|
218
|
+
#define GGML_MAX_PARAMS 2048
|
219
219
|
#define GGML_MAX_CONTEXTS 64
|
220
|
-
#define GGML_MAX_SRC 6
|
220
|
+
#define GGML_MAX_SRC 10
|
221
221
|
#define GGML_MAX_NAME 64
|
222
222
|
#define GGML_MAX_OP_PARAMS 64
|
223
223
|
#define GGML_DEFAULT_N_THREADS 4
|
@@ -283,6 +283,20 @@
|
|
283
283
|
const type prefix##3 = (pointer)->array[3]; \
|
284
284
|
GGML_UNUSED(prefix##3);
|
285
285
|
|
286
|
+
#define GGML_TENSOR_UNARY_OP_LOCALS \
|
287
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
|
288
|
+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
|
289
|
+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
290
|
+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
291
|
+
|
292
|
+
#define GGML_TENSOR_BINARY_OP_LOCALS \
|
293
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
|
294
|
+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
|
295
|
+
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
|
296
|
+
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
|
297
|
+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
298
|
+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
299
|
+
|
286
300
|
#ifdef __cplusplus
|
287
301
|
extern "C" {
|
288
302
|
#endif
|
@@ -381,6 +395,7 @@ extern "C" {
|
|
381
395
|
GGML_OP_GROUP_NORM,
|
382
396
|
|
383
397
|
GGML_OP_MUL_MAT,
|
398
|
+
GGML_OP_MUL_MAT_ID,
|
384
399
|
GGML_OP_OUT_PROD,
|
385
400
|
|
386
401
|
GGML_OP_SCALE,
|
@@ -407,8 +422,10 @@ extern "C" {
|
|
407
422
|
GGML_OP_CONV_TRANSPOSE_2D,
|
408
423
|
GGML_OP_POOL_1D,
|
409
424
|
GGML_OP_POOL_2D,
|
410
|
-
|
411
425
|
GGML_OP_UPSCALE, // nearest interpolate
|
426
|
+
GGML_OP_PAD,
|
427
|
+
GGML_OP_ARGSORT,
|
428
|
+
GGML_OP_LEAKY_RELU,
|
412
429
|
|
413
430
|
GGML_OP_FLASH_ATTN,
|
414
431
|
GGML_OP_FLASH_FF,
|
@@ -448,7 +465,8 @@ extern "C" {
|
|
448
465
|
GGML_UNARY_OP_GELU,
|
449
466
|
GGML_UNARY_OP_GELU_QUICK,
|
450
467
|
GGML_UNARY_OP_SILU,
|
451
|
-
|
468
|
+
|
469
|
+
GGML_UNARY_OP_COUNT,
|
452
470
|
};
|
453
471
|
|
454
472
|
enum ggml_object_type {
|
@@ -484,7 +502,6 @@ extern "C" {
|
|
484
502
|
|
485
503
|
struct ggml_backend_buffer * buffer;
|
486
504
|
|
487
|
-
int n_dims;
|
488
505
|
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
489
506
|
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
490
507
|
// nb[0] = ggml_type_size(type)
|
@@ -516,7 +533,7 @@ extern "C" {
|
|
516
533
|
|
517
534
|
void * extra; // extra things e.g. for ggml-cuda.cu
|
518
535
|
|
519
|
-
char padding[12];
|
536
|
+
char padding[8];
|
520
537
|
};
|
521
538
|
|
522
539
|
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
@@ -621,16 +638,22 @@ extern "C" {
|
|
621
638
|
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
622
639
|
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
623
640
|
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
624
|
-
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
|
625
641
|
|
626
|
-
GGML_API int ggml_blck_size (enum ggml_type type);
|
627
|
-
GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
628
|
-
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
642
|
+
GGML_API int ggml_blck_size(enum ggml_type type);
|
643
|
+
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
644
|
+
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
645
|
+
|
646
|
+
GGML_DEPRECATED(
|
647
|
+
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
648
|
+
"use ggml_row_size() instead");
|
629
649
|
|
630
650
|
GGML_API const char * ggml_type_name(enum ggml_type type);
|
631
651
|
GGML_API const char * ggml_op_name (enum ggml_op op);
|
632
652
|
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
633
653
|
|
654
|
+
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
|
655
|
+
GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
|
656
|
+
|
634
657
|
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
635
658
|
|
636
659
|
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
@@ -641,6 +664,11 @@ extern "C" {
|
|
641
664
|
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
642
665
|
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
643
666
|
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
667
|
+
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
668
|
+
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
669
|
+
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
670
|
+
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
671
|
+
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
644
672
|
|
645
673
|
GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
646
674
|
|
@@ -773,6 +801,9 @@ extern "C" {
|
|
773
801
|
struct ggml_tensor * a,
|
774
802
|
struct ggml_tensor * b);
|
775
803
|
|
804
|
+
// dst = a
|
805
|
+
// view(dst, nb1, nb2, nb3, offset) += b
|
806
|
+
// return dst
|
776
807
|
GGML_API struct ggml_tensor * ggml_acc(
|
777
808
|
struct ggml_context * ctx,
|
778
809
|
struct ggml_tensor * a,
|
@@ -937,15 +968,14 @@ extern "C" {
|
|
937
968
|
struct ggml_context * ctx,
|
938
969
|
struct ggml_tensor * a);
|
939
970
|
|
940
|
-
GGML_API struct ggml_tensor * ggml_leaky(
|
971
|
+
GGML_API struct ggml_tensor * ggml_leaky_relu(
|
941
972
|
struct ggml_context * ctx,
|
942
|
-
struct ggml_tensor * a);
|
973
|
+
struct ggml_tensor * a, float negative_slope, bool inplace);
|
943
974
|
|
944
975
|
GGML_API struct ggml_tensor * ggml_relu_inplace(
|
945
976
|
struct ggml_context * ctx,
|
946
977
|
struct ggml_tensor * a);
|
947
978
|
|
948
|
-
// TODO: double-check this computation is correct
|
949
979
|
GGML_API struct ggml_tensor * ggml_gelu(
|
950
980
|
struct ggml_context * ctx,
|
951
981
|
struct ggml_tensor * a);
|
@@ -1027,6 +1057,16 @@ extern "C" {
|
|
1027
1057
|
struct ggml_tensor * a,
|
1028
1058
|
struct ggml_tensor * b);
|
1029
1059
|
|
1060
|
+
// indirect matrix multiplication
|
1061
|
+
// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
|
1062
|
+
GGML_API struct ggml_tensor * ggml_mul_mat_id(
|
1063
|
+
struct ggml_context * ctx,
|
1064
|
+
struct ggml_tensor * const as[],
|
1065
|
+
int n_as,
|
1066
|
+
struct ggml_tensor * ids,
|
1067
|
+
int id,
|
1068
|
+
struct ggml_tensor * b);
|
1069
|
+
|
1030
1070
|
// A: m columns, n rows,
|
1031
1071
|
// B: p columns, n rows,
|
1032
1072
|
// result is m columns, p rows
|
@@ -1234,6 +1274,7 @@ extern "C" {
|
|
1234
1274
|
struct ggml_context * ctx,
|
1235
1275
|
struct ggml_tensor * a);
|
1236
1276
|
|
1277
|
+
// supports 3D: a->ne[2] == b->ne[1]
|
1237
1278
|
GGML_API struct ggml_tensor * ggml_get_rows(
|
1238
1279
|
struct ggml_context * ctx,
|
1239
1280
|
struct ggml_tensor * a,
|
@@ -1520,6 +1561,32 @@ extern "C" {
|
|
1520
1561
|
struct ggml_tensor * a,
|
1521
1562
|
int scale_factor);
|
1522
1563
|
|
1564
|
+
// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
|
1565
|
+
GGML_API struct ggml_tensor * ggml_pad(
|
1566
|
+
struct ggml_context * ctx,
|
1567
|
+
struct ggml_tensor * a,
|
1568
|
+
int p0,
|
1569
|
+
int p1,
|
1570
|
+
int p2,
|
1571
|
+
int p3);
|
1572
|
+
|
1573
|
+
// sort rows
|
1574
|
+
enum ggml_sort_order {
|
1575
|
+
GGML_SORT_ASC,
|
1576
|
+
GGML_SORT_DESC,
|
1577
|
+
};
|
1578
|
+
|
1579
|
+
GGML_API struct ggml_tensor * ggml_argsort(
|
1580
|
+
struct ggml_context * ctx,
|
1581
|
+
struct ggml_tensor * a,
|
1582
|
+
enum ggml_sort_order order);
|
1583
|
+
|
1584
|
+
// top k elements per row
|
1585
|
+
GGML_API struct ggml_tensor * ggml_top_k(
|
1586
|
+
struct ggml_context * ctx,
|
1587
|
+
struct ggml_tensor * a,
|
1588
|
+
int k);
|
1589
|
+
|
1523
1590
|
GGML_API struct ggml_tensor * ggml_flash_attn(
|
1524
1591
|
struct ggml_context * ctx,
|
1525
1592
|
struct ggml_tensor * q,
|
@@ -1581,7 +1648,6 @@ extern "C" {
|
|
1581
1648
|
int kh);
|
1582
1649
|
|
1583
1650
|
// used in sam
|
1584
|
-
|
1585
1651
|
GGML_API struct ggml_tensor * ggml_add_rel_pos(
|
1586
1652
|
struct ggml_context * ctx,
|
1587
1653
|
struct ggml_tensor * a,
|
@@ -1756,7 +1822,7 @@ extern "C" {
|
|
1756
1822
|
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
1757
1823
|
GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
|
1758
1824
|
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
1759
|
-
GGML_API struct ggml_cgraph * ggml_graph_view (struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i0, int i1);
|
1825
|
+
GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
|
1760
1826
|
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
1761
1827
|
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
|
1762
1828
|
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|