@fugood/llama.node 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. package/bin/darwin/arm64/default.metallib +0 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/default.metallib +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/package.json +1 -1
  18. package/src/LlamaContext.cpp +2 -2
  19. package/src/llama.cpp/CMakeLists.txt +72 -46
  20. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  21. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
  22. package/src/llama.cpp/common/common.cpp +732 -752
  23. package/src/llama.cpp/common/common.h +47 -41
  24. package/src/llama.cpp/common/grammar-parser.cpp +1 -1
  25. package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
  26. package/src/llama.cpp/common/log.h +5 -5
  27. package/src/llama.cpp/common/sampling.cpp +89 -7
  28. package/src/llama.cpp/common/sampling.h +5 -0
  29. package/src/llama.cpp/common/train.cpp +2 -2
  30. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  31. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  32. package/src/llama.cpp/examples/embedding/embedding.cpp +3 -2
  33. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
  34. package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
  35. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
  36. package/src/llama.cpp/examples/infill/infill.cpp +8 -8
  37. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  38. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +13 -8
  39. package/src/llama.cpp/examples/llava/clip.h +1 -1
  40. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  41. package/src/llama.cpp/examples/llava/llava.cpp +0 -15
  42. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  43. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  44. package/src/llama.cpp/examples/main/main.cpp +24 -16
  45. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  46. package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
  47. package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
  48. package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
  49. package/src/llama.cpp/examples/rpc/rpc-server.cpp +78 -14
  50. package/src/llama.cpp/examples/server/server.cpp +21 -9
  51. package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
  52. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
  53. package/src/llama.cpp/ggml-backend.c +0 -1
  54. package/src/llama.cpp/ggml-common.h +0 -54
  55. package/src/llama.cpp/ggml-cuda.h +1 -0
  56. package/src/llama.cpp/ggml-impl.h +51 -0
  57. package/src/llama.cpp/ggml-kompute.cpp +4 -0
  58. package/src/llama.cpp/ggml-opencl.cpp +4 -1
  59. package/src/llama.cpp/ggml-quants.c +3700 -2041
  60. package/src/llama.cpp/ggml-rpc.cpp +188 -56
  61. package/src/llama.cpp/ggml-sycl.cpp +99 -530
  62. package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
  63. package/src/llama.cpp/ggml-vulkan.cpp +202 -225
  64. package/src/llama.cpp/ggml.c +1034 -1154
  65. package/src/llama.cpp/ggml.h +59 -31
  66. package/src/llama.cpp/llama.cpp +859 -609
  67. package/src/llama.cpp/llama.h +19 -6
  68. package/src/llama.cpp/requirements.txt +0 -1
  69. package/src/llama.cpp/tests/test-backend-ops.cpp +113 -47
  70. package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
  71. package/src/llama.cpp/tests/test-grad0.cpp +43 -83
  72. package/src/llama.cpp/unicode-data.cpp +6969 -2169
  73. package/src/llama.cpp/unicode-data.h +15 -12
  74. package/src/llama.cpp/unicode.cpp +89 -111
  75. package/src/llama.cpp/unicode.h +44 -12
  76. package/src/llama.cpp/build.zig +0 -172
  77. package/src/llama.cpp/ggml-mpi.c +0 -216
  78. package/src/llama.cpp/ggml-mpi.h +0 -39
  79. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
@@ -481,9 +481,7 @@ extern "C" {
481
481
  GGML_OP_ARGSORT,
482
482
  GGML_OP_LEAKY_RELU,
483
483
 
484
- GGML_OP_FLASH_ATTN,
485
484
  GGML_OP_FLASH_ATTN_EXT,
486
- GGML_OP_FLASH_FF,
487
485
  GGML_OP_FLASH_ATTN_BACK,
488
486
  GGML_OP_SSM_CONV,
489
487
  GGML_OP_SSM_SCAN,
@@ -565,7 +563,8 @@ extern "C" {
565
563
  // n-dimensional tensor
566
564
  struct ggml_tensor {
567
565
  enum ggml_type type;
568
- enum ggml_backend_type backend;
566
+
567
+ GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
569
568
 
570
569
  struct ggml_backend_buffer * buffer;
571
570
 
@@ -766,7 +765,8 @@ extern "C" {
766
765
  GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
767
766
  GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
768
767
 
769
- GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
768
+ GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
769
+ GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
770
770
 
771
771
  // use this to compute the memory overhead of a tensor
772
772
  GGML_API size_t ggml_tensor_overhead(void);
@@ -1007,12 +1007,13 @@ extern "C" {
1007
1007
  struct ggml_tensor * a,
1008
1008
  struct ggml_tensor * b);
1009
1009
 
1010
- // concat a and b on dim 2
1010
+ // concat a and b along dim
1011
1011
  // used in stable-diffusion
1012
1012
  GGML_API struct ggml_tensor * ggml_concat(
1013
1013
  struct ggml_context * ctx,
1014
1014
  struct ggml_tensor * a,
1015
- struct ggml_tensor * b);
1015
+ struct ggml_tensor * b,
1016
+ int dim);
1016
1017
 
1017
1018
  GGML_API struct ggml_tensor * ggml_abs(
1018
1019
  struct ggml_context * ctx,
@@ -1458,11 +1459,12 @@ extern "C" {
1458
1459
  struct ggml_tensor * b);
1459
1460
 
1460
1461
  // rotary position embedding
1461
- // if mode & 1 == 1, skip n_past elements (DEPRECATED)
1462
+ // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
1462
1463
  // if mode & 2 == 1, GPT-NeoX style
1463
1464
  // if mode & 4 == 1, ChatGLM style
1464
1465
  //
1465
1466
  // b is an int32 vector with size a->ne[2], it contains the positions
1467
+ // c is freq factors (e.g. phi3-128k), (optional)
1466
1468
  GGML_API struct ggml_tensor * ggml_rope(
1467
1469
  struct ggml_context * ctx,
1468
1470
  struct ggml_tensor * a,
@@ -1481,10 +1483,11 @@ extern "C" {
1481
1483
  int n_ctx);
1482
1484
 
1483
1485
  // custom RoPE
1484
- GGML_API struct ggml_tensor * ggml_rope_custom(
1486
+ GGML_API struct ggml_tensor * ggml_rope_ext(
1485
1487
  struct ggml_context * ctx,
1486
1488
  struct ggml_tensor * a,
1487
1489
  struct ggml_tensor * b,
1490
+ struct ggml_tensor * c,
1488
1491
  int n_dims,
1489
1492
  int mode,
1490
1493
  int n_ctx,
@@ -1497,10 +1500,11 @@ extern "C" {
1497
1500
  float beta_slow);
1498
1501
 
1499
1502
  // in-place, returns view(a)
1500
- GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
1503
+ GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
1501
1504
  struct ggml_context * ctx,
1502
1505
  struct ggml_tensor * a,
1503
1506
  struct ggml_tensor * b,
1507
+ struct ggml_tensor * c,
1504
1508
  int n_dims,
1505
1509
  int mode,
1506
1510
  int n_ctx,
@@ -1512,18 +1516,41 @@ extern "C" {
1512
1516
  float beta_fast,
1513
1517
  float beta_slow);
1514
1518
 
1515
- // compute correction dims for YaRN RoPE scaling
1516
- GGML_CALL void ggml_rope_yarn_corr_dims(
1517
- int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1519
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
1520
+ struct ggml_context * ctx,
1521
+ struct ggml_tensor * a,
1522
+ struct ggml_tensor * b,
1523
+ int n_dims,
1524
+ int mode,
1525
+ int n_ctx,
1526
+ int n_orig_ctx,
1527
+ float freq_base,
1528
+ float freq_scale,
1529
+ float ext_factor,
1530
+ float attn_factor,
1531
+ float beta_fast,
1532
+ float beta_slow),
1533
+ "use ggml_rope_ext instead");
1518
1534
 
1519
- // xPos RoPE, in-place, returns view(a)
1520
- GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
1535
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
1521
1536
  struct ggml_context * ctx,
1522
1537
  struct ggml_tensor * a,
1523
1538
  struct ggml_tensor * b,
1524
1539
  int n_dims,
1525
- float base,
1526
- bool down);
1540
+ int mode,
1541
+ int n_ctx,
1542
+ int n_orig_ctx,
1543
+ float freq_base,
1544
+ float freq_scale,
1545
+ float ext_factor,
1546
+ float attn_factor,
1547
+ float beta_fast,
1548
+ float beta_slow),
1549
+ "use ggml_rope_ext_inplace instead");
1550
+
1551
+ // compute correction dims for YaRN RoPE scaling
1552
+ GGML_CALL void ggml_rope_yarn_corr_dims(
1553
+ int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1527
1554
 
1528
1555
  // rotary position embedding backward, i.e compute dx from dy
1529
1556
  // a - dy
@@ -1531,6 +1558,7 @@ extern "C" {
1531
1558
  struct ggml_context * ctx,
1532
1559
  struct ggml_tensor * a,
1533
1560
  struct ggml_tensor * b,
1561
+ struct ggml_tensor * c,
1534
1562
  int n_dims,
1535
1563
  int mode,
1536
1564
  int n_ctx,
@@ -1673,12 +1701,24 @@ extern "C" {
1673
1701
  float p1);
1674
1702
 
1675
1703
  // nearest interpolate
1704
+ // multiplies ne0 and ne1 by scale factor
1676
1705
  // used in stable-diffusion
1677
1706
  GGML_API struct ggml_tensor * ggml_upscale(
1678
1707
  struct ggml_context * ctx,
1679
1708
  struct ggml_tensor * a,
1680
1709
  int scale_factor);
1681
1710
 
1711
+ // nearest interpolate
1712
+ // nearest interpolate to specified dimensions
1713
+ // used in tortoise.cpp
1714
+ GGML_API struct ggml_tensor * ggml_upscale_ext(
1715
+ struct ggml_context * ctx,
1716
+ struct ggml_tensor * a,
1717
+ int ne0,
1718
+ int ne1,
1719
+ int ne2,
1720
+ int ne3);
1721
+
1682
1722
  // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
1683
1723
  GGML_API struct ggml_tensor * ggml_pad(
1684
1724
  struct ggml_context * ctx,
@@ -1720,13 +1760,6 @@ extern "C" {
1720
1760
  struct ggml_tensor * a,
1721
1761
  int k);
1722
1762
 
1723
- GGML_API struct ggml_tensor * ggml_flash_attn(
1724
- struct ggml_context * ctx,
1725
- struct ggml_tensor * q,
1726
- struct ggml_tensor * k,
1727
- struct ggml_tensor * v,
1728
- bool masked);
1729
-
1730
1763
  #define GGML_KQ_MASK_PAD 32
1731
1764
 
1732
1765
  // q: [n_embd, n_batch, n_head, 1]
@@ -1747,6 +1780,7 @@ extern "C" {
1747
1780
  struct ggml_tensor * a,
1748
1781
  enum ggml_prec prec);
1749
1782
 
1783
+ // TODO: needs to be adapted to ggml_flash_attn_ext
1750
1784
  GGML_API struct ggml_tensor * ggml_flash_attn_back(
1751
1785
  struct ggml_context * ctx,
1752
1786
  struct ggml_tensor * q,
@@ -1755,14 +1789,6 @@ extern "C" {
1755
1789
  struct ggml_tensor * d,
1756
1790
  bool masked);
1757
1791
 
1758
- GGML_API struct ggml_tensor * ggml_flash_ff(
1759
- struct ggml_context * ctx,
1760
- struct ggml_tensor * a,
1761
- struct ggml_tensor * b0,
1762
- struct ggml_tensor * b1,
1763
- struct ggml_tensor * c0,
1764
- struct ggml_tensor * c1);
1765
-
1766
1792
  GGML_API struct ggml_tensor * ggml_ssm_conv(
1767
1793
  struct ggml_context * ctx,
1768
1794
  struct ggml_tensor * s,
@@ -2376,8 +2402,10 @@ extern "C" {
2376
2402
  GGML_API int ggml_cpu_has_avx512 (void);
2377
2403
  GGML_API int ggml_cpu_has_avx512_vbmi(void);
2378
2404
  GGML_API int ggml_cpu_has_avx512_vnni(void);
2405
+ GGML_API int ggml_cpu_has_avx512_bf16(void);
2379
2406
  GGML_API int ggml_cpu_has_fma (void);
2380
2407
  GGML_API int ggml_cpu_has_neon (void);
2408
+ GGML_API int ggml_cpu_has_sve (void);
2381
2409
  GGML_API int ggml_cpu_has_arm_fma (void);
2382
2410
  GGML_API int ggml_cpu_has_metal (void);
2383
2411
  GGML_API int ggml_cpu_has_f16c (void);