@fugood/llama.node 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. package/CMakeLists.txt +9 -0
  2. package/README.md +1 -1
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/bin/win32/arm64/llama-node.node +0 -0
  12. package/bin/win32/arm64/node.lib +0 -0
  13. package/bin/win32/x64/llama-node.node +0 -0
  14. package/bin/win32/x64/node.lib +0 -0
  15. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/arm64/node.lib +0 -0
  17. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  18. package/bin/win32-vulkan/x64/node.lib +0 -0
  19. package/lib/binding.ts +1 -1
  20. package/package.json +2 -1
  21. package/patches/llama.patch +22 -0
  22. package/src/LlamaContext.cpp +2 -2
  23. package/src/TokenizeWorker.cpp +1 -1
  24. package/src/llama.cpp/CMakeLists.txt +82 -54
  25. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  26. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
  27. package/src/llama.cpp/common/common.cpp +748 -754
  28. package/src/llama.cpp/common/common.h +49 -41
  29. package/src/llama.cpp/common/grammar-parser.cpp +10 -1
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
  31. package/src/llama.cpp/common/log.h +5 -5
  32. package/src/llama.cpp/common/sampling.cpp +92 -10
  33. package/src/llama.cpp/common/sampling.h +6 -1
  34. package/src/llama.cpp/common/train.cpp +2 -2
  35. package/src/llama.cpp/examples/CMakeLists.txt +3 -0
  36. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  37. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  38. package/src/llama.cpp/examples/embedding/embedding.cpp +13 -4
  39. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
  40. package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
  42. package/src/llama.cpp/examples/infill/infill.cpp +8 -8
  43. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +57 -8
  44. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +55 -0
  45. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/CMakeLists.txt +7 -8
  46. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
  47. package/src/llama.cpp/examples/llava/clip.h +1 -1
  48. package/src/llama.cpp/examples/llava/llava-cli.cpp +27 -7
  49. package/src/llama.cpp/examples/llava/llava.cpp +0 -15
  50. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  51. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  52. package/src/llama.cpp/examples/main/main.cpp +29 -17
  53. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  54. package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
  55. package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
  56. package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
  57. package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
  58. package/src/llama.cpp/examples/rpc/rpc-server.cpp +134 -0
  59. package/src/llama.cpp/examples/server/server.cpp +33 -25
  60. package/src/llama.cpp/examples/server/utils.hpp +1 -1
  61. package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
  62. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
  63. package/src/llama.cpp/ggml-backend.c +2 -3
  64. package/src/llama.cpp/ggml-common.h +0 -54
  65. package/src/llama.cpp/ggml-cuda.h +1 -0
  66. package/src/llama.cpp/ggml-impl.h +51 -0
  67. package/src/llama.cpp/ggml-kompute.cpp +13 -3
  68. package/src/llama.cpp/ggml-opencl.cpp +4 -1
  69. package/src/llama.cpp/ggml-quants.c +3715 -2050
  70. package/src/llama.cpp/ggml-rpc.cpp +1155 -0
  71. package/src/llama.cpp/ggml-rpc.h +24 -0
  72. package/src/llama.cpp/ggml-sycl.cpp +119 -673
  73. package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
  74. package/src/llama.cpp/ggml-vulkan.cpp +203 -224
  75. package/src/llama.cpp/ggml.c +1208 -1483
  76. package/src/llama.cpp/ggml.h +71 -46
  77. package/src/llama.cpp/llama.cpp +1374 -938
  78. package/src/llama.cpp/llama.h +22 -6
  79. package/src/llama.cpp/requirements.txt +0 -2
  80. package/src/llama.cpp/tests/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/tests/test-backend-ops.cpp +120 -57
  82. package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
  83. package/src/llama.cpp/tests/test-grad0.cpp +43 -83
  84. package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
  85. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
  86. package/src/llama.cpp/unicode-data.cpp +6969 -2169
  87. package/src/llama.cpp/unicode-data.h +15 -12
  88. package/src/llama.cpp/unicode.cpp +89 -111
  89. package/src/llama.cpp/unicode.h +44 -12
  90. package/src/llama.cpp/build.zig +0 -172
  91. package/src/llama.cpp/ggml-mpi.c +0 -216
  92. package/src/llama.cpp/ggml-mpi.h +0 -39
  93. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
  94. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
@@ -468,7 +468,6 @@ extern "C" {
  GGML_OP_SOFT_MAX_BACK,
  GGML_OP_ROPE,
  GGML_OP_ROPE_BACK,
- GGML_OP_ALIBI,
  GGML_OP_CLAMP,
  GGML_OP_CONV_TRANSPOSE_1D,
  GGML_OP_IM2COL,
@@ -482,9 +481,7 @@ extern "C" {
  GGML_OP_ARGSORT,
  GGML_OP_LEAKY_RELU,

- GGML_OP_FLASH_ATTN,
  GGML_OP_FLASH_ATTN_EXT,
- GGML_OP_FLASH_FF,
  GGML_OP_FLASH_ATTN_BACK,
  GGML_OP_SSM_CONV,
  GGML_OP_SSM_SCAN,
@@ -520,6 +517,7 @@ extern "C" {
  GGML_UNARY_OP_TANH,
  GGML_UNARY_OP_ELU,
  GGML_UNARY_OP_RELU,
+ GGML_UNARY_OP_SIGMOID,
  GGML_UNARY_OP_GELU,
  GGML_UNARY_OP_GELU_QUICK,
  GGML_UNARY_OP_SILU,
@@ -565,7 +563,8 @@ extern "C" {
  // n-dimensional tensor
  struct ggml_tensor {
  enum ggml_type type;
- enum ggml_backend_type backend;
+
+ GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");

  struct ggml_backend_buffer * buffer;

@@ -766,7 +765,8 @@ extern "C" {
  GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
  GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars

- GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+ GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+ GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

  // use this to compute the memory overhead of a tensor
  GGML_API size_t ggml_tensor_overhead(void);
@@ -1007,12 +1007,13 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);

- // concat a and b on dim 2
+ // concat a and b along dim
  // used in stable-diffusion
  GGML_API struct ggml_tensor * ggml_concat(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b);
+ struct ggml_tensor * b,
+ int dim);

  GGML_API struct ggml_tensor * ggml_abs(
  struct ggml_context * ctx,
@@ -1074,6 +1075,14 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_sigmoid(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_sigmoid_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_gelu(
  struct ggml_context * ctx,
  struct ggml_tensor * a);
@@ -1428,15 +1437,13 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

- // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
+ // fused soft_max(a*scale + mask*(ALiBi slope))
  // mask is optional
- // pos is required when max_bias > 0.0f
  // max_bias = 0.0f for no ALiBi
  GGML_API struct ggml_tensor * ggml_soft_max_ext(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * mask,
- struct ggml_tensor * pos,
  float scale,
  float max_bias);

@@ -1452,11 +1459,12 @@ extern "C" {
  struct ggml_tensor * b);

  // rotary position embedding
- // if mode & 1 == 1, skip n_past elements (DEPRECATED)
+ // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
  // if mode & 2 == 1, GPT-NeoX style
  // if mode & 4 == 1, ChatGLM style
  //
  // b is an int32 vector with size a->ne[2], it contains the positions
+ // c is freq factors (e.g. phi3-128k), (optional)
  GGML_API struct ggml_tensor * ggml_rope(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -1475,10 +1483,11 @@ extern "C" {
  int n_ctx);

  // custom RoPE
- GGML_API struct ggml_tensor * ggml_rope_custom(
+ GGML_API struct ggml_tensor * ggml_rope_ext(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
+ struct ggml_tensor * c,
  int n_dims,
  int mode,
  int n_ctx,
@@ -1491,10 +1500,11 @@ extern "C" {
  float beta_slow);

  // in-place, returns view(a)
- GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+ GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
+ struct ggml_tensor * c,
  int n_dims,
  int mode,
  int n_ctx,
@@ -1506,18 +1516,41 @@ extern "C" {
  float beta_fast,
  float beta_slow);

- // compute correction dims for YaRN RoPE scaling
- GGML_CALL void ggml_rope_yarn_corr_dims(
- int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int n_dims,
+ int mode,
+ int n_ctx,
+ int n_orig_ctx,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow),
+ "use ggml_rope_ext instead");

- // xPos RoPE, in-place, returns view(a)
- GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  int n_dims,
- float base,
- bool down);
+ int mode,
+ int n_ctx,
+ int n_orig_ctx,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow),
+ "use ggml_rope_ext_inplace instead");
+
+ // compute correction dims for YaRN RoPE scaling
+ GGML_CALL void ggml_rope_yarn_corr_dims(
+ int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);

  // rotary position embedding backward, i.e compute dx from dy
  // a - dy
@@ -1525,6 +1558,7 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
+ struct ggml_tensor * c,
  int n_dims,
  int mode,
  int n_ctx,
@@ -1538,16 +1572,6 @@ extern "C" {
  float xpos_base,
  bool xpos_down);

- // alibi position embedding
- // in-place, returns view(a)
- GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int n_past,
- int n_head,
- float bias_max),
- "use ggml_soft_max_ext instead (will be removed in Mar 2024)");
-
  // clamp
  // in-place, returns view(a)
  GGML_API struct ggml_tensor * ggml_clamp(
@@ -1677,12 +1701,24 @@ extern "C" {
  float p1);

  // nearest interpolate
+ // multiplies ne0 and ne1 by scale factor
  // used in stable-diffusion
  GGML_API struct ggml_tensor * ggml_upscale(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  int scale_factor);

+ // nearest interpolate
+ // nearest interpolate to specified dimensions
+ // used in tortoise.cpp
+ GGML_API struct ggml_tensor * ggml_upscale_ext(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int ne0,
+ int ne1,
+ int ne2,
+ int ne3);
+
  // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
  GGML_API struct ggml_tensor * ggml_pad(
  struct ggml_context * ctx,
@@ -1724,13 +1760,6 @@ extern "C" {
  struct ggml_tensor * a,
  int k);

- GGML_API struct ggml_tensor * ggml_flash_attn(
- struct ggml_context * ctx,
- struct ggml_tensor * q,
- struct ggml_tensor * k,
- struct ggml_tensor * v,
- bool masked);
-
  #define GGML_KQ_MASK_PAD 32

  // q: [n_embd, n_batch, n_head, 1]
@@ -1744,12 +1773,14 @@ extern "C" {
  struct ggml_tensor * k,
  struct ggml_tensor * v,
  struct ggml_tensor * mask,
- float scale);
+ float scale,
+ float max_bias);

  GGML_API void ggml_flash_attn_ext_set_prec(
  struct ggml_tensor * a,
  enum ggml_prec prec);

+ // TODO: needs to be adapted to ggml_flash_attn_ext
  GGML_API struct ggml_tensor * ggml_flash_attn_back(
  struct ggml_context * ctx,
  struct ggml_tensor * q,
@@ -1758,14 +1789,6 @@ extern "C" {
  struct ggml_tensor * d,
  bool masked);

- GGML_API struct ggml_tensor * ggml_flash_ff(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b0,
- struct ggml_tensor * b1,
- struct ggml_tensor * c0,
- struct ggml_tensor * c1);
-
  GGML_API struct ggml_tensor * ggml_ssm_conv(
  struct ggml_context * ctx,
  struct ggml_tensor * s,
@@ -2379,8 +2402,10 @@ extern "C" {
  GGML_API int ggml_cpu_has_avx512 (void);
  GGML_API int ggml_cpu_has_avx512_vbmi(void);
  GGML_API int ggml_cpu_has_avx512_vnni(void);
+ GGML_API int ggml_cpu_has_avx512_bf16(void);
  GGML_API int ggml_cpu_has_fma (void);
  GGML_API int ggml_cpu_has_neon (void);
+ GGML_API int ggml_cpu_has_sve (void);
  GGML_API int ggml_cpu_has_arm_fma (void);
  GGML_API int ggml_cpu_has_metal (void);
  GGML_API int ggml_cpu_has_f16c (void);