@fugood/llama.node 0.3.14 → 0.3.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/package.json +1 -1
  18. package/src/llama.cpp/.github/workflows/build.yml +30 -1
  19. package/src/llama.cpp/CMakeLists.txt +9 -1
  20. package/src/llama.cpp/cmake/common.cmake +2 -0
  21. package/src/llama.cpp/common/arg.cpp +20 -2
  22. package/src/llama.cpp/common/common.cpp +6 -3
  23. package/src/llama.cpp/common/speculative.cpp +4 -4
  24. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  25. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +1 -1
  26. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -1
  27. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  28. package/src/llama.cpp/examples/imatrix/imatrix.cpp +1 -1
  29. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  30. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  31. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +4 -4
  32. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +1 -1
  33. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -6
  34. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  35. package/src/llama.cpp/examples/main/main.cpp +6 -6
  36. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -5
  37. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  38. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  39. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  40. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  41. package/src/llama.cpp/examples/run/run.cpp +91 -46
  42. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  43. package/src/llama.cpp/examples/server/server.cpp +37 -15
  44. package/src/llama.cpp/examples/server/utils.hpp +3 -1
  45. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  46. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  47. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  48. package/src/llama.cpp/examples/tts/tts.cpp +20 -9
  49. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  50. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  51. package/src/llama.cpp/ggml/include/ggml.h +24 -0
  52. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -28
  53. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  54. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -5
  55. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -7
  56. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +1493 -12
  57. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +150 -1
  58. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +284 -29
  59. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  60. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  61. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +7 -0
  62. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
  63. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +95 -22
  64. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +35 -12
  65. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -1
  66. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +93 -27
  67. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  68. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +12 -13
  69. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  70. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
  71. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -2
  72. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +109 -40
  73. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  74. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +19 -20
  75. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  76. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  77. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -1
  78. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  79. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  80. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +398 -158
  81. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  82. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +7 -2
  83. package/src/llama.cpp/ggml/src/ggml.c +85 -2
  84. package/src/llama.cpp/include/llama.h +86 -22
  85. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  86. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  87. package/src/llama.cpp/src/llama-adapter.h +11 -9
  88. package/src/llama.cpp/src/llama-arch.cpp +103 -16
  89. package/src/llama.cpp/src/llama-arch.h +18 -0
  90. package/src/llama.cpp/src/llama-batch.h +2 -2
  91. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  92. package/src/llama.cpp/src/llama-context.h +214 -77
  93. package/src/llama.cpp/src/llama-cparams.h +1 -0
  94. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  95. package/src/llama.cpp/src/llama-graph.h +574 -0
  96. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  97. package/src/llama.cpp/src/llama-hparams.h +9 -0
  98. package/src/llama.cpp/src/llama-io.cpp +15 -0
  99. package/src/llama.cpp/src/llama-io.h +35 -0
  100. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  101. package/src/llama.cpp/src/llama-kv-cache.h +178 -110
  102. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  103. package/src/llama.cpp/src/llama-memory.h +21 -0
  104. package/src/llama.cpp/src/llama-model.cpp +8244 -173
  105. package/src/llama.cpp/src/llama-model.h +34 -1
  106. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  107. package/src/llama.cpp/src/llama.cpp +51 -9984
  108. package/src/llama.cpp/tests/test-backend-ops.cpp +145 -23
  109. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  110. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
@@ -2,7 +2,9 @@
2
2
 
3
3
  #include "llama.h"
4
4
  #include "llama-arch.h"
5
+ #include "llama-graph.h"
5
6
  #include "llama-hparams.h"
7
+ #include "llama-memory.h"
6
8
  #include "llama-vocab.h"
7
9
 
8
10
  #include <memory>
@@ -10,6 +12,8 @@
10
12
  #include <unordered_map>
11
13
  #include <vector>
12
14
 
15
+ struct llama_cparams;
16
+ struct llama_ubatch;
13
17
  struct llama_model_loader;
14
18
 
15
19
  // available models
@@ -25,6 +29,7 @@ enum llm_type {
25
29
  LLM_TYPE_109M,
26
30
  LLM_TYPE_137M,
27
31
  LLM_TYPE_160M,
32
+ LLM_TYPE_190M,
28
33
  LLM_TYPE_220M,
29
34
  LLM_TYPE_250M,
30
35
  LLM_TYPE_270M,
@@ -41,6 +46,7 @@ enum llm_type {
41
46
  LLM_TYPE_1_6B,
42
47
  LLM_TYPE_2B,
43
48
  LLM_TYPE_2_8B,
49
+ LLM_TYPE_2_9B,
44
50
  LLM_TYPE_3B,
45
51
  LLM_TYPE_4B,
46
52
  LLM_TYPE_6B,
@@ -256,6 +262,20 @@ struct llama_layer {
256
262
  struct ggml_tensor * time_mix_receptance_b = nullptr;
257
263
  struct ggml_tensor * time_mix_gate = nullptr;
258
264
 
265
+ // rwkv7
266
+ struct ggml_tensor * time_mix_w0 = nullptr;
267
+ struct ggml_tensor * time_mix_a0 = nullptr;
268
+ struct ggml_tensor * time_mix_a1 = nullptr;
269
+ struct ggml_tensor * time_mix_a2 = nullptr;
270
+ struct ggml_tensor * time_mix_v0 = nullptr;
271
+ struct ggml_tensor * time_mix_v1 = nullptr;
272
+ struct ggml_tensor * time_mix_v2 = nullptr;
273
+ struct ggml_tensor * time_mix_g1 = nullptr;
274
+ struct ggml_tensor * time_mix_g2 = nullptr;
275
+ struct ggml_tensor * time_mix_k_k = nullptr;
276
+ struct ggml_tensor * time_mix_k_a = nullptr;
277
+ struct ggml_tensor * time_mix_r_k = nullptr;
278
+
259
279
  struct ggml_tensor * time_mix_ln = nullptr;
260
280
  struct ggml_tensor * time_mix_ln_b = nullptr;
261
281
  struct ggml_tensor * time_mix_output = nullptr;
@@ -347,7 +367,7 @@ struct llama_model {
347
367
  std::string desc() const;
348
368
 
349
369
  size_t size() const;
350
- size_t max_nodes() const;
370
+ size_t n_tensors() const;
351
371
  size_t n_devices() const;
352
372
 
353
373
  // total number of parameters in the model
@@ -362,9 +382,22 @@ struct llama_model {
362
382
 
363
383
  const struct ggml_tensor * get_tensor(const char * name) const;
364
384
 
385
+ // TODO: move this to new llm_arch_model_i interface
386
+ llama_memory_i * create_memory() const; // TODO: params
387
+
388
+ // TODO: move this to new llm_arch_model_i interface
389
+ llm_graph_result_ptr build_graph(
390
+ const llm_graph_params & params,
391
+ ggml_cgraph * gf,
392
+ llm_graph_type type) const;
393
+
365
394
  private:
366
395
  struct impl;
367
396
  std::unique_ptr<impl> pimpl;
368
397
  };
369
398
 
370
399
  const char * llm_type_name(llm_type type);
400
+
401
+ // For internal test use
402
+ // TODO: remove
403
+ const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
@@ -756,10 +756,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
756
756
  // NOTE: can't use LLM_TN here because the layer number is not known
757
757
  quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
758
758
 
759
- // do not quantize RWKV's time_mix_first tensors
759
+ // do not quantize RWKV's small yet 2D weights
760
760
  quantize &= name.find("time_mix_first.weight") == std::string::npos;
761
+ quantize &= name.find("time_mix_w0.weight") == std::string::npos;
761
762
  quantize &= name.find("time_mix_w1.weight") == std::string::npos;
762
763
  quantize &= name.find("time_mix_w2.weight") == std::string::npos;
764
+ quantize &= name.find("time_mix_v0.weight") == std::string::npos;
765
+ quantize &= name.find("time_mix_v1.weight") == std::string::npos;
766
+ quantize &= name.find("time_mix_v2.weight") == std::string::npos;
767
+ quantize &= name.find("time_mix_a0.weight") == std::string::npos;
768
+ quantize &= name.find("time_mix_a1.weight") == std::string::npos;
769
+ quantize &= name.find("time_mix_a2.weight") == std::string::npos;
770
+ quantize &= name.find("time_mix_g1.weight") == std::string::npos;
771
+ quantize &= name.find("time_mix_g2.weight") == std::string::npos;
763
772
  quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
764
773
  quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
765
774
  quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;