llama_cpp 0.15.0 → 0.15.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/llama_cpp.cpp +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -7
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +303 -23
- data/vendor/tmp/llama.cpp/ggml-impl.h +84 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +137 -133
- data/vendor/tmp/llama.cpp/ggml-metal.metal +87 -110
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +2220 -28
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1032 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +35 -152
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +953 -268
- data/vendor/tmp/llama.cpp/ggml.c +1762 -681
- data/vendor/tmp/llama.cpp/ggml.h +43 -24
- data/vendor/tmp/llama.cpp/llama.cpp +533 -296
- data/vendor/tmp/llama.cpp/llama.h +10 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +56 -21
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -1637
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -11
- data/vendor/tmp/llama.cpp/unicode.cpp +286 -176
- data/vendor/tmp/llama.cpp/unicode.h +44 -10
- metadata +4 -2
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -326,14 +326,20 @@ extern "C" {
|
|
326
326
|
// get ggml_status name string
|
327
327
|
GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
|
328
328
|
|
329
|
+
// ieee 754-2008 half-precision float16
|
330
|
+
// todo: make this not an integral type
|
329
331
|
typedef uint16_t ggml_fp16_t;
|
330
|
-
|
331
|
-
|
332
|
-
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
333
|
-
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
|
334
|
-
|
335
|
-
|
336
|
-
|
332
|
+
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t);
|
333
|
+
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
|
334
|
+
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
|
335
|
+
GGML_API void ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
|
336
|
+
|
337
|
+
// google brain half-precision bfloat16
|
338
|
+
typedef struct { uint16_t bits; } ggml_bf16_t;
|
339
|
+
GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
|
340
|
+
GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
|
341
|
+
GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
|
342
|
+
GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
|
337
343
|
|
338
344
|
struct ggml_object;
|
339
345
|
struct ggml_context;
|
@@ -370,6 +376,7 @@ extern "C" {
|
|
370
376
|
GGML_TYPE_I64 = 27,
|
371
377
|
GGML_TYPE_F64 = 28,
|
372
378
|
GGML_TYPE_IQ1_M = 29,
|
379
|
+
GGML_TYPE_BF16 = 30,
|
373
380
|
GGML_TYPE_COUNT,
|
374
381
|
};
|
375
382
|
|
@@ -410,6 +417,7 @@ extern "C" {
|
|
410
417
|
GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
|
411
418
|
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
|
412
419
|
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
|
420
|
+
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
|
413
421
|
};
|
414
422
|
|
415
423
|
// available tensor operations:
|
@@ -460,7 +468,6 @@ extern "C" {
|
|
460
468
|
GGML_OP_SOFT_MAX_BACK,
|
461
469
|
GGML_OP_ROPE,
|
462
470
|
GGML_OP_ROPE_BACK,
|
463
|
-
GGML_OP_ALIBI,
|
464
471
|
GGML_OP_CLAMP,
|
465
472
|
GGML_OP_CONV_TRANSPOSE_1D,
|
466
473
|
GGML_OP_IM2COL,
|
@@ -512,6 +519,7 @@ extern "C" {
|
|
512
519
|
GGML_UNARY_OP_TANH,
|
513
520
|
GGML_UNARY_OP_ELU,
|
514
521
|
GGML_UNARY_OP_RELU,
|
522
|
+
GGML_UNARY_OP_SIGMOID,
|
515
523
|
GGML_UNARY_OP_GELU,
|
516
524
|
GGML_UNARY_OP_GELU_QUICK,
|
517
525
|
GGML_UNARY_OP_SILU,
|
@@ -557,7 +565,8 @@ extern "C" {
|
|
557
565
|
// n-dimensional tensor
|
558
566
|
struct ggml_tensor {
|
559
567
|
enum ggml_type type;
|
560
|
-
|
568
|
+
|
569
|
+
GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
|
561
570
|
|
562
571
|
struct ggml_backend_buffer * buffer;
|
563
572
|
|
@@ -758,7 +767,8 @@ extern "C" {
|
|
758
767
|
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
759
768
|
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
760
769
|
|
761
|
-
GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
770
|
+
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
771
|
+
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
762
772
|
|
763
773
|
// use this to compute the memory overhead of a tensor
|
764
774
|
GGML_API size_t ggml_tensor_overhead(void);
|
@@ -1066,6 +1076,14 @@ extern "C" {
|
|
1066
1076
|
struct ggml_context * ctx,
|
1067
1077
|
struct ggml_tensor * a);
|
1068
1078
|
|
1079
|
+
GGML_API struct ggml_tensor * ggml_sigmoid(
|
1080
|
+
struct ggml_context * ctx,
|
1081
|
+
struct ggml_tensor * a);
|
1082
|
+
|
1083
|
+
GGML_API struct ggml_tensor * ggml_sigmoid_inplace(
|
1084
|
+
struct ggml_context * ctx,
|
1085
|
+
struct ggml_tensor * a);
|
1086
|
+
|
1069
1087
|
GGML_API struct ggml_tensor * ggml_gelu(
|
1070
1088
|
struct ggml_context * ctx,
|
1071
1089
|
struct ggml_tensor * a);
|
@@ -1420,15 +1438,13 @@ extern "C" {
|
|
1420
1438
|
struct ggml_context * ctx,
|
1421
1439
|
struct ggml_tensor * a);
|
1422
1440
|
|
1423
|
-
// fused soft_max(a*scale + mask + slope*pos)
|
1441
|
+
// fused soft_max(a*scale + mask*(ALiBi slope))
|
1424
1442
|
// mask is optional
|
1425
|
-
// pos is required when max_bias > 0.0f
|
1426
1443
|
// max_bias = 0.0f for no ALiBi
|
1427
1444
|
GGML_API struct ggml_tensor * ggml_soft_max_ext(
|
1428
1445
|
struct ggml_context * ctx,
|
1429
1446
|
struct ggml_tensor * a,
|
1430
1447
|
struct ggml_tensor * mask,
|
1431
|
-
struct ggml_tensor * pos,
|
1432
1448
|
float scale,
|
1433
1449
|
float max_bias);
|
1434
1450
|
|
@@ -1530,16 +1546,6 @@ extern "C" {
|
|
1530
1546
|
float xpos_base,
|
1531
1547
|
bool xpos_down);
|
1532
1548
|
|
1533
|
-
// alibi position embedding
|
1534
|
-
// in-place, returns view(a)
|
1535
|
-
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
|
1536
|
-
struct ggml_context * ctx,
|
1537
|
-
struct ggml_tensor * a,
|
1538
|
-
int n_past,
|
1539
|
-
int n_head,
|
1540
|
-
float bias_max),
|
1541
|
-
"use ggml_soft_max_ext instead (will be removed in Mar 2024)");
|
1542
|
-
|
1543
1549
|
// clamp
|
1544
1550
|
// in-place, returns view(a)
|
1545
1551
|
GGML_API struct ggml_tensor * ggml_clamp(
|
@@ -1669,12 +1675,24 @@ extern "C" {
|
|
1669
1675
|
float p1);
|
1670
1676
|
|
1671
1677
|
// nearest interpolate
|
1678
|
+
// multiplies ne0 and ne1 by scale factor
|
1672
1679
|
// used in stable-diffusion
|
1673
1680
|
GGML_API struct ggml_tensor * ggml_upscale(
|
1674
1681
|
struct ggml_context * ctx,
|
1675
1682
|
struct ggml_tensor * a,
|
1676
1683
|
int scale_factor);
|
1677
1684
|
|
1685
|
+
// nearest interpolate
|
1686
|
+
// nearest interpolate to specified dimensions
|
1687
|
+
// used in tortoise.cpp
|
1688
|
+
GGML_API struct ggml_tensor * ggml_upscale_ext(
|
1689
|
+
struct ggml_context * ctx,
|
1690
|
+
struct ggml_tensor * a,
|
1691
|
+
int ne0,
|
1692
|
+
int ne1,
|
1693
|
+
int ne2,
|
1694
|
+
int ne3);
|
1695
|
+
|
1678
1696
|
// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
|
1679
1697
|
GGML_API struct ggml_tensor * ggml_pad(
|
1680
1698
|
struct ggml_context * ctx,
|
@@ -1736,7 +1754,8 @@ extern "C" {
|
|
1736
1754
|
struct ggml_tensor * k,
|
1737
1755
|
struct ggml_tensor * v,
|
1738
1756
|
struct ggml_tensor * mask,
|
1739
|
-
float scale);
|
1757
|
+
float scale,
|
1758
|
+
float max_bias);
|
1740
1759
|
|
1741
1760
|
GGML_API void ggml_flash_attn_ext_set_prec(
|
1742
1761
|
struct ggml_tensor * a,
|