@fugood/llama.node 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- package/CMakeLists.txt +9 -0
- package/README.md +1 -1
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +2 -1
- package/patches/llama.patch +22 -0
- package/src/LlamaContext.cpp +2 -2
- package/src/TokenizeWorker.cpp +1 -1
- package/src/llama.cpp/CMakeLists.txt +82 -54
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
- package/src/llama.cpp/common/common.cpp +748 -754
- package/src/llama.cpp/common/common.h +49 -41
- package/src/llama.cpp/common/grammar-parser.cpp +10 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
- package/src/llama.cpp/common/log.h +5 -5
- package/src/llama.cpp/common/sampling.cpp +92 -10
- package/src/llama.cpp/common/sampling.h +6 -1
- package/src/llama.cpp/common/train.cpp +2 -2
- package/src/llama.cpp/examples/CMakeLists.txt +3 -0
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +13 -4
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
- package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
- package/src/llama.cpp/examples/infill/infill.cpp +8 -8
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +57 -8
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +55 -0
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/CMakeLists.txt +7 -8
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
- package/src/llama.cpp/examples/llava/clip.h +1 -1
- package/src/llama.cpp/examples/llava/llava-cli.cpp +27 -7
- package/src/llama.cpp/examples/llava/llava.cpp +0 -15
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +29 -17
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
- package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +134 -0
- package/src/llama.cpp/examples/server/server.cpp +33 -25
- package/src/llama.cpp/examples/server/utils.hpp +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
- package/src/llama.cpp/ggml-backend.c +2 -3
- package/src/llama.cpp/ggml-common.h +0 -54
- package/src/llama.cpp/ggml-cuda.h +1 -0
- package/src/llama.cpp/ggml-impl.h +51 -0
- package/src/llama.cpp/ggml-kompute.cpp +13 -3
- package/src/llama.cpp/ggml-opencl.cpp +4 -1
- package/src/llama.cpp/ggml-quants.c +3715 -2050
- package/src/llama.cpp/ggml-rpc.cpp +1155 -0
- package/src/llama.cpp/ggml-rpc.h +24 -0
- package/src/llama.cpp/ggml-sycl.cpp +119 -673
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- package/src/llama.cpp/ggml-vulkan.cpp +203 -224
- package/src/llama.cpp/ggml.c +1208 -1483
- package/src/llama.cpp/ggml.h +71 -46
- package/src/llama.cpp/llama.cpp +1374 -938
- package/src/llama.cpp/llama.h +22 -6
- package/src/llama.cpp/requirements.txt +0 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +120 -57
- package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
- package/src/llama.cpp/tests/test-grad0.cpp +43 -83
- package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
- package/src/llama.cpp/unicode-data.cpp +6969 -2169
- package/src/llama.cpp/unicode-data.h +15 -12
- package/src/llama.cpp/unicode.cpp +89 -111
- package/src/llama.cpp/unicode.h +44 -12
- package/src/llama.cpp/build.zig +0 -172
- package/src/llama.cpp/ggml-mpi.c +0 -216
- package/src/llama.cpp/ggml-mpi.h +0 -39
- package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
package/src/llama.cpp/ggml.h
CHANGED
@@ -468,7 +468,6 @@ extern "C" {
         GGML_OP_SOFT_MAX_BACK,
         GGML_OP_ROPE,
         GGML_OP_ROPE_BACK,
-        GGML_OP_ALIBI,
         GGML_OP_CLAMP,
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
@@ -482,9 +481,7 @@ extern "C" {
         GGML_OP_ARGSORT,
         GGML_OP_LEAKY_RELU,
 
-        GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_ATTN_EXT,
-        GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_SSM_CONV,
         GGML_OP_SSM_SCAN,
@@ -520,6 +517,7 @@ extern "C" {
         GGML_UNARY_OP_TANH,
         GGML_UNARY_OP_ELU,
         GGML_UNARY_OP_RELU,
+        GGML_UNARY_OP_SIGMOID,
         GGML_UNARY_OP_GELU,
         GGML_UNARY_OP_GELU_QUICK,
         GGML_UNARY_OP_SILU,
@@ -565,7 +563,8 @@ extern "C" {
     // n-dimensional tensor
     struct ggml_tensor {
         enum ggml_type type;
-        enum ggml_backend_type backend;
+
+        GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
 
         struct ggml_backend_buffer * buffer;
 
@@ -766,7 +765,8 @@ extern "C" {
     GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
     GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
 
-    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+    GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+    GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
@@ -1007,12 +1007,13 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
-    // concat a and b
+    // concat a and b along dim
     // used in stable-diffusion
     GGML_API struct ggml_tensor * ggml_concat(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            struct ggml_tensor * b);
+            struct ggml_tensor * b,
+            int dim);
 
     GGML_API struct ggml_tensor * ggml_abs(
             struct ggml_context * ctx,
@@ -1074,6 +1075,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_sigmoid(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_sigmoid_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_gelu(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
@@ -1428,15 +1437,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
-    // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
+    // fused soft_max(a*scale + mask*(ALiBi slope))
     // mask is optional
-    // pos is required when max_bias > 0.0f
     // max_bias = 0.0f for no ALiBi
     GGML_API struct ggml_tensor * ggml_soft_max_ext(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * mask,
-            struct ggml_tensor * pos,
             float scale,
             float max_bias);
 
@@ -1452,11 +1459,12 @@ extern "C" {
             struct ggml_tensor * b);
 
     // rotary position embedding
-    // if mode & 1 == 1, skip n_past elements (DEPRECATED)
+    // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
     // if mode & 2 == 1, GPT-NeoX style
     // if mode & 4 == 1, ChatGLM style
     //
     // b is an int32 vector with size a->ne[2], it contains the positions
+    // c is freq factors (e.g. phi3-128k), (optional)
     GGML_API struct ggml_tensor * ggml_rope(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -1475,10 +1483,11 @@ extern "C" {
             int n_ctx);
 
     // custom RoPE
-    GGML_API struct ggml_tensor * ggml_rope_custom(
+    GGML_API struct ggml_tensor * ggml_rope_ext(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
+            struct ggml_tensor * c,
             int n_dims,
             int mode,
             int n_ctx,
@@ -1491,10 +1500,11 @@ extern "C" {
             float beta_slow);
 
     // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+    GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
+            struct ggml_tensor * c,
             int n_dims,
             int mode,
             int n_ctx,
@@ -1506,18 +1516,41 @@ extern "C" {
             float beta_fast,
             float beta_slow);
 
-    // compute correction dims for YaRN RoPE scaling
-    GGML_CALL void ggml_rope_yarn_corr_dims(
-        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            int n_dims,
+            int mode,
+            int n_ctx,
+            int n_orig_ctx,
+            float freq_base,
+            float freq_scale,
+            float ext_factor,
+            float attn_factor,
+            float beta_fast,
+            float beta_slow),
+            "use ggml_rope_ext instead");
 
-    // xPos RoPE, in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
             int n_dims,
-            float base,
-            bool down);
+            int mode,
+            int n_ctx,
+            int n_orig_ctx,
+            float freq_base,
+            float freq_scale,
+            float ext_factor,
+            float attn_factor,
+            float beta_fast,
+            float beta_slow),
+            "use ggml_rope_ext_inplace instead");
+
+    // compute correction dims for YaRN RoPE scaling
+    GGML_CALL void ggml_rope_yarn_corr_dims(
+        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
 
     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
@@ -1525,6 +1558,7 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
+            struct ggml_tensor * c,
             int n_dims,
             int mode,
             int n_ctx,
@@ -1538,16 +1572,6 @@ extern "C" {
             float xpos_base,
             bool xpos_down);
 
-    // alibi position embedding
-    // in-place, returns view(a)
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            int n_past,
-            int n_head,
-            float bias_max),
-            "use ggml_soft_max_ext instead (will be removed in Mar 2024)");
-
     // clamp
     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_clamp(
@@ -1677,12 +1701,24 @@ extern "C" {
             float p1);
 
     // nearest interpolate
+    // multiplies ne0 and ne1 by scale factor
     // used in stable-diffusion
     GGML_API struct ggml_tensor * ggml_upscale(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int scale_factor);
 
+    // nearest interpolate
+    // nearest interpolate to specified dimensions
+    // used in tortoise.cpp
+    GGML_API struct ggml_tensor * ggml_upscale_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int ne0,
+            int ne1,
+            int ne2,
+            int ne3);
+
     // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
     GGML_API struct ggml_tensor * ggml_pad(
             struct ggml_context * ctx,
@@ -1724,13 +1760,6 @@ extern "C" {
             struct ggml_tensor * a,
             int k);
 
-    GGML_API struct ggml_tensor * ggml_flash_attn(
-            struct ggml_context * ctx,
-            struct ggml_tensor * q,
-            struct ggml_tensor * k,
-            struct ggml_tensor * v,
-            bool masked);
-
 #define GGML_KQ_MASK_PAD 32
 
     // q: [n_embd, n_batch, n_head, 1]
@@ -1744,12 +1773,14 @@ extern "C" {
             struct ggml_tensor * k,
             struct ggml_tensor * v,
             struct ggml_tensor * mask,
-            float scale);
+            float scale,
+            float max_bias);
 
     GGML_API void ggml_flash_attn_ext_set_prec(
             struct ggml_tensor * a,
             enum ggml_prec prec);
 
+    // TODO: needs to be adapted to ggml_flash_attn_ext
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
             struct ggml_tensor * q,
@@ -1758,14 +1789,6 @@ extern "C" {
             struct ggml_tensor * d,
             bool masked);
 
-    GGML_API struct ggml_tensor * ggml_flash_ff(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            struct ggml_tensor * b0,
-            struct ggml_tensor * b1,
-            struct ggml_tensor * c0,
-            struct ggml_tensor * c1);
-
     GGML_API struct ggml_tensor * ggml_ssm_conv(
             struct ggml_context * ctx,
             struct ggml_tensor * s,
@@ -2379,8 +2402,10 @@ extern "C" {
     GGML_API int ggml_cpu_has_avx512 (void);
     GGML_API int ggml_cpu_has_avx512_vbmi(void);
    GGML_API int ggml_cpu_has_avx512_vnni(void);
+    GGML_API int ggml_cpu_has_avx512_bf16(void);
     GGML_API int ggml_cpu_has_fma (void);
     GGML_API int ggml_cpu_has_neon (void);
+    GGML_API int ggml_cpu_has_sve (void);
     GGML_API int ggml_cpu_has_arm_fma (void);
     GGML_API int ggml_cpu_has_metal (void);
     GGML_API int ggml_cpu_has_f16c (void);