llama_cpp 0.15.0 → 0.15.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/llama_cpp.cpp +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -7
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +303 -23
- data/vendor/tmp/llama.cpp/ggml-impl.h +84 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +137 -133
- data/vendor/tmp/llama.cpp/ggml-metal.metal +87 -110
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +2220 -28
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1032 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +35 -152
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +953 -268
- data/vendor/tmp/llama.cpp/ggml.c +1762 -681
- data/vendor/tmp/llama.cpp/ggml.h +43 -24
- data/vendor/tmp/llama.cpp/llama.cpp +533 -296
- data/vendor/tmp/llama.cpp/llama.h +10 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +56 -21
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -1637
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -11
- data/vendor/tmp/llama.cpp/unicode.cpp +286 -176
- data/vendor/tmp/llama.cpp/unicode.h +44 -10
- metadata +4 -2
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -326,14 +326,20 @@ extern "C" {
     // get ggml_status name string
     GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
 
+    // ieee 754-2008 half-precision float16
+    // todo: make this not an integral type
     typedef uint16_t ggml_fp16_t;
-
-    // convert FP16 <-> FP32
-    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
-    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
-
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
+    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t);
+    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
+    GGML_API void        ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
+
+    // google brain half-precision bfloat16
+    typedef struct { uint16_t bits; } ggml_bf16_t;
+    GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
+    GGML_API float       ggml_bf16_to_fp32(ggml_bf16_t);  // consider just doing << 16
+    GGML_API void        ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
 
     struct ggml_object;
     struct ggml_context;
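The new bfloat16 helpers mirror the fp16 ones above. A ggml_bf16_t is simply the high 16 bits of an IEEE-754 fp32 bit pattern (sign, 8 exponent bits, 7 mantissa bits), which is what the "consider just doing << 16" comment alludes to. Below is a minimal standalone sketch of that bit-level relationship; the demo names are made up, and ggml's actual ggml_fp32_to_bf16 may round and treat NaN differently than this plain truncation:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Illustrative only -- not the ggml implementation. */
    typedef struct { uint16_t bits; } bf16_demo_t;

    static float bf16_demo_to_fp32(bf16_demo_t h) {
        uint32_t u = (uint32_t) h.bits << 16;     /* widening is exact: just a shift */
        float f;
        memcpy(&f, &u, sizeof(f));
        return f;
    }

    static bf16_demo_t fp32_to_bf16_demo(float f) {
        uint32_t u;
        memcpy(&u, &f, sizeof(u));
        bf16_demo_t h = { (uint16_t) (u >> 16) }; /* drop the low 16 mantissa bits */
        return h;
    }

    int main(void) {
        bf16_demo_t h = fp32_to_bf16_demo(3.14159f);
        printf("%f\n", (double) bf16_demo_to_fp32(h)); /* ~3.140625: 7 mantissa bits survive */
        return 0;
    }

Widening bf16 to fp32 is exact; narrowing loses the low 16 mantissa bits, which is why 3.14159f comes back as 3.140625f.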
@@ -370,6 +376,7 @@ extern "C" {
         GGML_TYPE_I64     = 27,
         GGML_TYPE_F64     = 28,
         GGML_TYPE_IQ1_M   = 29,
+        GGML_TYPE_BF16    = 30,
         GGML_TYPE_COUNT,
     };
 
@@ -410,6 +417,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ2_S  = 21, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M  = 23, // except 1d tensors
+        GGML_FTYPE_MOSTLY_BF16   = 24, // except 1d tensors
     };
 
     // available tensor operations:
@@ -460,7 +468,6 @@ extern "C" {
         GGML_OP_SOFT_MAX_BACK,
         GGML_OP_ROPE,
         GGML_OP_ROPE_BACK,
-        GGML_OP_ALIBI,
         GGML_OP_CLAMP,
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
@@ -512,6 +519,7 @@ extern "C" {
         GGML_UNARY_OP_TANH,
         GGML_UNARY_OP_ELU,
         GGML_UNARY_OP_RELU,
+        GGML_UNARY_OP_SIGMOID,
         GGML_UNARY_OP_GELU,
         GGML_UNARY_OP_GELU_QUICK,
         GGML_UNARY_OP_SILU,
@@ -557,7 +565,8 @@ extern "C" {
     // n-dimensional tensor
     struct ggml_tensor {
         enum ggml_type type;
-        enum ggml_backend_type backend;
+
+        GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
 
         struct ggml_backend_buffer * buffer;
 
@@ -758,7 +767,8 @@ extern "C" {
     GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
     GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
 
-    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+    GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+    GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
@@ -1066,6 +1076,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_sigmoid(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_sigmoid_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     GGML_API struct ggml_tensor * ggml_gelu(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
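The new ggml_sigmoid is a regular unary op (see GGML_UNARY_OP_SIGMOID above), so it plugs into the usual context-and-graph workflow. A minimal usage sketch, assuming a CPU-only build; the scratch size and thread count are arbitrary and error handling is omitted:

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16 * 1024 * 1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_set_f32(x, -1.0f);                        // fill every element with -1.0

        struct ggml_tensor * y = ggml_sigmoid(ctx, x); // elementwise 1 / (1 + exp(-x))

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, y);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 1);

        // ggml_get_f32_1d(y, 0) is now ~0.26894, i.e. sigmoid(-1)
        ggml_free(ctx);
        return 0;
    }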
@@ -1420,15 +1438,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
-    // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
+    // fused soft_max(a*scale + mask*(ALiBi slope))
     // mask is optional
-    // pos is required when max_bias > 0.0f
     // max_bias = 0.0f for no ALiBi
     GGML_API struct ggml_tensor * ggml_soft_max_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * mask,
-            struct ggml_tensor  * pos,
             float                 scale,
             float                 max_bias);
 
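Callers migrating across this change drop the pos tensor: the per-head ALiBi slope is now derived inside the op from max_bias and applied to the mask term. A before/after sketch with hypothetical tensor names:

    // 0.15.0 vendor copy: explicit position tensor
    // kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, max_bias);

    // 0.15.2 vendor copy: no pos argument; ALiBi is controlled by max_bias alone
    kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, max_bias);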
@@ -1530,16 +1546,6 @@ extern "C" {
             float                 xpos_base,
             bool                  xpos_down);
 
-    // alibi position embedding
-    // in-place, returns view(a)
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_past,
-            int                   n_head,
-            float                 bias_max),
-        "use ggml_soft_max_ext instead (will be removed in Mar 2024)");
-
     // clamp
     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_clamp(
@@ -1669,12 +1675,24 @@ extern "C" {
             float                 p1);
 
     // nearest interpolate
+    // multiplies ne0 and ne1 by scale factor
     // used in stable-diffusion
     GGML_API struct ggml_tensor * ggml_upscale(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   scale_factor);
 
+    // nearest interpolate
+    // nearest interpolate to specified dimensions
+    // used in tortoise.cpp
+    GGML_API struct ggml_tensor * ggml_upscale_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   ne0,
+            int                   ne1,
+            int                   ne2,
+            int                   ne3);
+
     // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
     GGML_API struct ggml_tensor * ggml_pad(
             struct ggml_context * ctx,
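ggml_upscale keeps its single uniform scale factor, while the new ggml_upscale_ext targets explicit output dimensions. A short sketch (tensor names are hypothetical, and ctx is assumed to be an initialized ggml_context as in the sigmoid example above):

    struct ggml_tensor * img   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 32, 32, 3, 1);
    struct ggml_tensor * up2x  = ggml_upscale    (ctx, img, 2);             // -> 64 x 64 x 3 x 1
    struct ggml_tensor * upany = ggml_upscale_ext(ctx, img, 100, 40, 3, 1); // -> 100 x 40 x 3 x 1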
@@ -1736,7 +1754,8 @@ extern "C" {
             struct ggml_tensor  * k,
             struct ggml_tensor  * v,
             struct ggml_tensor  * mask,
-            float                 scale);
+            float                 scale,
+            float                 max_bias);
 
     GGML_API void ggml_flash_attn_ext_set_prec(
             struct ggml_tensor * a,
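Existing ggml_flash_attn_ext call sites gain one trailing argument; passing 0.0f keeps the previous no-ALiBi behaviour. A sketch with hypothetical tensor names:

    // before: ggml_flash_attn_ext(ctx, q, k, v, mask, kq_scale);
    struct ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, kq_scale, /*max_bias=*/ 0.0f);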