llama_cpp 0.9.4 → 0.10.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +121 -15
- data/ext/llama_cpp/src/ggml-alloc.c +43 -8
- data/ext/llama_cpp/src/ggml-alloc.h +7 -0
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1270 -434
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +535 -175
- data/ext/llama_cpp/src/ggml-metal.metal +888 -237
- data/ext/llama_cpp/src/ggml-opencl.cpp +5 -7
- data/ext/llama_cpp/src/ggml.c +393 -127
- data/ext/llama_cpp/src/ggml.h +59 -7
- data/ext/llama_cpp/src/llama.cpp +791 -357
- data/ext/llama_cpp/src/llama.h +29 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +20 -2
- metadata +3 -3
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -244,11 +244,10 @@
|
|
244
244
|
#define GGML_ASSERT(x) \
|
245
245
|
do { \
|
246
246
|
if (!(x)) { \
|
247
|
-
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
248
|
-
fflush(stderr); \
|
249
247
|
fflush(stdout); \
|
248
|
+
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
250
249
|
ggml_print_backtrace(); \
|
251
|
-
|
250
|
+
abort(); \
|
252
251
|
} \
|
253
252
|
} while (0)
|
254
253
|
|
@@ -284,6 +283,20 @@
|
|
284
283
|
const type prefix##3 = (pointer)->array[3]; \
|
285
284
|
GGML_UNUSED(prefix##3);
|
286
285
|
|
286
|
+
#define GGML_TENSOR_UNARY_OP_LOCALS \
|
287
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
|
288
|
+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
|
289
|
+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
290
|
+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
291
|
+
|
292
|
+
#define GGML_TENSOR_BINARY_OP_LOCALS \
|
293
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
|
294
|
+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
|
295
|
+
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
|
296
|
+
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
|
297
|
+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
298
|
+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
299
|
+
|
287
300
|
#ifdef __cplusplus
|
288
301
|
extern "C" {
|
289
302
|
#endif
|
@@ -382,6 +395,7 @@ extern "C" {
|
|
382
395
|
GGML_OP_GROUP_NORM,
|
383
396
|
|
384
397
|
GGML_OP_MUL_MAT,
|
398
|
+
GGML_OP_MUL_MAT_ID,
|
385
399
|
GGML_OP_OUT_PROD,
|
386
400
|
|
387
401
|
GGML_OP_SCALE,
|
@@ -408,8 +422,8 @@ extern "C" {
|
|
408
422
|
GGML_OP_CONV_TRANSPOSE_2D,
|
409
423
|
GGML_OP_POOL_1D,
|
410
424
|
GGML_OP_POOL_2D,
|
411
|
-
|
412
425
|
GGML_OP_UPSCALE, // nearest interpolate
|
426
|
+
GGML_OP_ARGSORT,
|
413
427
|
|
414
428
|
GGML_OP_FLASH_ATTN,
|
415
429
|
GGML_OP_FLASH_FF,
|
@@ -449,7 +463,9 @@ extern "C" {
|
|
449
463
|
GGML_UNARY_OP_GELU,
|
450
464
|
GGML_UNARY_OP_GELU_QUICK,
|
451
465
|
GGML_UNARY_OP_SILU,
|
452
|
-
GGML_UNARY_OP_LEAKY
|
466
|
+
GGML_UNARY_OP_LEAKY,
|
467
|
+
|
468
|
+
GGML_UNARY_OP_COUNT,
|
453
469
|
};
|
454
470
|
|
455
471
|
enum ggml_object_type {
|
@@ -632,6 +648,9 @@ extern "C" {
|
|
632
648
|
GGML_API const char * ggml_op_name (enum ggml_op op);
|
633
649
|
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
634
650
|
|
651
|
+
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
|
652
|
+
GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
|
653
|
+
|
635
654
|
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
636
655
|
|
637
656
|
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
@@ -1028,6 +1047,15 @@ extern "C" {
|
|
1028
1047
|
struct ggml_tensor * a,
|
1029
1048
|
struct ggml_tensor * b);
|
1030
1049
|
|
1050
|
+
// indirect matrix multiplication
|
1051
|
+
// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
|
1052
|
+
GGML_API struct ggml_tensor * ggml_mul_mat_id(
|
1053
|
+
struct ggml_context * ctx,
|
1054
|
+
struct ggml_tensor * as[],
|
1055
|
+
struct ggml_tensor * ids,
|
1056
|
+
int id,
|
1057
|
+
struct ggml_tensor * b);
|
1058
|
+
|
1031
1059
|
// A: m columns, n rows,
|
1032
1060
|
// B: p columns, n rows,
|
1033
1061
|
// result is m columns, p rows
|
@@ -1283,6 +1311,14 @@ extern "C" {
|
|
1283
1311
|
struct ggml_context * ctx,
|
1284
1312
|
struct ggml_tensor * a);
|
1285
1313
|
|
1314
|
+
// fused soft_max(a*scale + mask)
|
1315
|
+
// mask is optional
|
1316
|
+
GGML_API struct ggml_tensor * ggml_soft_max_ext(
|
1317
|
+
struct ggml_context * ctx,
|
1318
|
+
struct ggml_tensor * a,
|
1319
|
+
struct ggml_tensor * mask,
|
1320
|
+
float scale);
|
1321
|
+
|
1286
1322
|
GGML_API struct ggml_tensor * ggml_soft_max_back(
|
1287
1323
|
struct ggml_context * ctx,
|
1288
1324
|
struct ggml_tensor * a,
|
@@ -1513,6 +1549,23 @@ extern "C" {
|
|
1513
1549
|
struct ggml_tensor * a,
|
1514
1550
|
int scale_factor);
|
1515
1551
|
|
1552
|
+
// sort rows
|
1553
|
+
enum ggml_sort_order {
|
1554
|
+
GGML_SORT_ASC,
|
1555
|
+
GGML_SORT_DESC,
|
1556
|
+
};
|
1557
|
+
|
1558
|
+
GGML_API struct ggml_tensor * ggml_argsort(
|
1559
|
+
struct ggml_context * ctx,
|
1560
|
+
struct ggml_tensor * a,
|
1561
|
+
enum ggml_sort_order order);
|
1562
|
+
|
1563
|
+
// top k elements per row
|
1564
|
+
GGML_API struct ggml_tensor * ggml_top_k(
|
1565
|
+
struct ggml_context * ctx,
|
1566
|
+
struct ggml_tensor * a,
|
1567
|
+
int k);
|
1568
|
+
|
1516
1569
|
GGML_API struct ggml_tensor * ggml_flash_attn(
|
1517
1570
|
struct ggml_context * ctx,
|
1518
1571
|
struct ggml_tensor * q,
|
@@ -1574,7 +1627,6 @@ extern "C" {
|
|
1574
1627
|
int kh);
|
1575
1628
|
|
1576
1629
|
// used in sam
|
1577
|
-
|
1578
1630
|
GGML_API struct ggml_tensor * ggml_add_rel_pos(
|
1579
1631
|
struct ggml_context * ctx,
|
1580
1632
|
struct ggml_tensor * a,
|
@@ -1749,7 +1801,7 @@ extern "C" {
|
|
1749
1801
|
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
1750
1802
|
GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
|
1751
1803
|
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
1752
|
-
GGML_API struct ggml_cgraph * ggml_graph_view (struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i0, int i1);
|
1804
|
+
GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
|
1753
1805
|
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
1754
1806
|
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
|
1755
1807
|
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|