llama_cpp 0.9.4 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +121 -15
- data/ext/llama_cpp/src/ggml-alloc.c +43 -8
- data/ext/llama_cpp/src/ggml-alloc.h +7 -0
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1270 -434
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +535 -175
- data/ext/llama_cpp/src/ggml-metal.metal +888 -237
- data/ext/llama_cpp/src/ggml-opencl.cpp +5 -7
- data/ext/llama_cpp/src/ggml.c +393 -127
- data/ext/llama_cpp/src/ggml.h +59 -7
- data/ext/llama_cpp/src/llama.cpp +791 -357
- data/ext/llama_cpp/src/llama.h +29 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +20 -2
- metadata +3 -3
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -244,11 +244,10 @@
|
|
244
244
|
#define GGML_ASSERT(x) \
|
245
245
|
do { \
|
246
246
|
if (!(x)) { \
|
247
|
-
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
248
|
-
fflush(stderr); \
|
249
247
|
fflush(stdout); \
|
248
|
+
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
250
249
|
ggml_print_backtrace(); \
|
251
|
-
|
250
|
+
abort(); \
|
252
251
|
} \
|
253
252
|
} while (0)
|
254
253
|
|
@@ -284,6 +283,20 @@
|
|
284
283
|
const type prefix##3 = (pointer)->array[3]; \
|
285
284
|
GGML_UNUSED(prefix##3);
|
286
285
|
|
286
|
+
#define GGML_TENSOR_UNARY_OP_LOCALS \
|
287
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
|
288
|
+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
|
289
|
+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
290
|
+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
291
|
+
|
292
|
+
#define GGML_TENSOR_BINARY_OP_LOCALS \
|
293
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
|
294
|
+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
|
295
|
+
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
|
296
|
+
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
|
297
|
+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
298
|
+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
299
|
+
|
287
300
|
#ifdef __cplusplus
|
288
301
|
extern "C" {
|
289
302
|
#endif
|
@@ -382,6 +395,7 @@ extern "C" {
|
|
382
395
|
GGML_OP_GROUP_NORM,
|
383
396
|
|
384
397
|
GGML_OP_MUL_MAT,
|
398
|
+
GGML_OP_MUL_MAT_ID,
|
385
399
|
GGML_OP_OUT_PROD,
|
386
400
|
|
387
401
|
GGML_OP_SCALE,
|
@@ -408,8 +422,8 @@ extern "C" {
|
|
408
422
|
GGML_OP_CONV_TRANSPOSE_2D,
|
409
423
|
GGML_OP_POOL_1D,
|
410
424
|
GGML_OP_POOL_2D,
|
411
|
-
|
412
425
|
GGML_OP_UPSCALE, // nearest interpolate
|
426
|
+
GGML_OP_ARGSORT,
|
413
427
|
|
414
428
|
GGML_OP_FLASH_ATTN,
|
415
429
|
GGML_OP_FLASH_FF,
|
@@ -449,7 +463,9 @@ extern "C" {
|
|
449
463
|
GGML_UNARY_OP_GELU,
|
450
464
|
GGML_UNARY_OP_GELU_QUICK,
|
451
465
|
GGML_UNARY_OP_SILU,
|
452
|
-
GGML_UNARY_OP_LEAKY
|
466
|
+
GGML_UNARY_OP_LEAKY,
|
467
|
+
|
468
|
+
GGML_UNARY_OP_COUNT,
|
453
469
|
};
|
454
470
|
|
455
471
|
enum ggml_object_type {
|
@@ -632,6 +648,9 @@ extern "C" {
|
|
632
648
|
GGML_API const char * ggml_op_name (enum ggml_op op);
|
633
649
|
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
634
650
|
|
651
|
+
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
|
652
|
+
GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
|
653
|
+
|
635
654
|
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
636
655
|
|
637
656
|
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
@@ -1028,6 +1047,15 @@ extern "C" {
|
|
1028
1047
|
struct ggml_tensor * a,
|
1029
1048
|
struct ggml_tensor * b);
|
1030
1049
|
|
1050
|
+
// indirect matrix multiplication
|
1051
|
+
// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
|
1052
|
+
GGML_API struct ggml_tensor * ggml_mul_mat_id(
|
1053
|
+
struct ggml_context * ctx,
|
1054
|
+
struct ggml_tensor * as[],
|
1055
|
+
struct ggml_tensor * ids,
|
1056
|
+
int id,
|
1057
|
+
struct ggml_tensor * b);
|
1058
|
+
|
1031
1059
|
// A: m columns, n rows,
|
1032
1060
|
// B: p columns, n rows,
|
1033
1061
|
// result is m columns, p rows
|
@@ -1283,6 +1311,14 @@ extern "C" {
|
|
1283
1311
|
struct ggml_context * ctx,
|
1284
1312
|
struct ggml_tensor * a);
|
1285
1313
|
|
1314
|
+
// fused soft_max(a*scale + mask)
|
1315
|
+
// mask is optional
|
1316
|
+
GGML_API struct ggml_tensor * ggml_soft_max_ext(
|
1317
|
+
struct ggml_context * ctx,
|
1318
|
+
struct ggml_tensor * a,
|
1319
|
+
struct ggml_tensor * mask,
|
1320
|
+
float scale);
|
1321
|
+
|
1286
1322
|
GGML_API struct ggml_tensor * ggml_soft_max_back(
|
1287
1323
|
struct ggml_context * ctx,
|
1288
1324
|
struct ggml_tensor * a,
|
@@ -1513,6 +1549,23 @@ extern "C" {
|
|
1513
1549
|
struct ggml_tensor * a,
|
1514
1550
|
int scale_factor);
|
1515
1551
|
|
1552
|
+
// sort rows
|
1553
|
+
enum ggml_sort_order {
|
1554
|
+
GGML_SORT_ASC,
|
1555
|
+
GGML_SORT_DESC,
|
1556
|
+
};
|
1557
|
+
|
1558
|
+
GGML_API struct ggml_tensor * ggml_argsort(
|
1559
|
+
struct ggml_context * ctx,
|
1560
|
+
struct ggml_tensor * a,
|
1561
|
+
enum ggml_sort_order order);
|
1562
|
+
|
1563
|
+
// top k elements per row
|
1564
|
+
GGML_API struct ggml_tensor * ggml_top_k(
|
1565
|
+
struct ggml_context * ctx,
|
1566
|
+
struct ggml_tensor * a,
|
1567
|
+
int k);
|
1568
|
+
|
1516
1569
|
GGML_API struct ggml_tensor * ggml_flash_attn(
|
1517
1570
|
struct ggml_context * ctx,
|
1518
1571
|
struct ggml_tensor * q,
|
@@ -1574,7 +1627,6 @@ extern "C" {
|
|
1574
1627
|
int kh);
|
1575
1628
|
|
1576
1629
|
// used in sam
|
1577
|
-
|
1578
1630
|
GGML_API struct ggml_tensor * ggml_add_rel_pos(
|
1579
1631
|
struct ggml_context * ctx,
|
1580
1632
|
struct ggml_tensor * a,
|
@@ -1749,7 +1801,7 @@ extern "C" {
|
|
1749
1801
|
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
1750
1802
|
GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
|
1751
1803
|
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
1752
|
-
GGML_API struct ggml_cgraph * ggml_graph_view (struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i0, int i1);
|
1804
|
+
GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
|
1753
1805
|
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
1754
1806
|
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
|
1755
1807
|
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|