@fugood/llama.node 0.3.13 → 0.3.14

This diff shows the changes between publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (139)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +60 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  25. package/src/llama.cpp/common/arg.cpp +112 -11
  26. package/src/llama.cpp/common/chat.cpp +960 -266
  27. package/src/llama.cpp/common/chat.h +135 -0
  28. package/src/llama.cpp/common/common.cpp +27 -171
  29. package/src/llama.cpp/common/common.h +27 -67
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  31. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  32. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  33. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  34. package/src/llama.cpp/common/sampling.cpp +45 -7
  35. package/src/llama.cpp/common/speculative.cpp +6 -5
  36. package/src/llama.cpp/common/speculative.h +1 -1
  37. package/src/llama.cpp/docs/build.md +45 -7
  38. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  39. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  40. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
  42. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  43. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  44. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  45. package/src/llama.cpp/examples/llava/clip.h +19 -3
  46. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  47. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  48. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  49. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  50. package/src/llama.cpp/examples/main/main.cpp +73 -28
  51. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  52. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  53. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  54. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  55. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  56. package/src/llama.cpp/examples/run/run.cpp +110 -67
  57. package/src/llama.cpp/examples/server/server.cpp +82 -87
  58. package/src/llama.cpp/examples/server/utils.hpp +94 -107
  59. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  60. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  61. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  62. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  63. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  64. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  65. package/src/llama.cpp/ggml/include/ggml.h +5 -1
  66. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  67. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  68. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  69. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  70. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  71. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  72. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  73. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  74. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  75. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  76. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  77. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  78. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
  79. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
  80. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  81. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  82. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  83. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  84. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  85. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  86. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  87. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  89. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  90. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  91. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  92. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
  93. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  94. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  95. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  96. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  97. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  98. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  99. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  100. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  101. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  102. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  103. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  104. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  105. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  106. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  107. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
  108. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  109. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  111. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  112. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
  113. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  114. package/src/llama.cpp/ggml/src/ggml.c +8 -3
  115. package/src/llama.cpp/include/llama.h +19 -5
  116. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  117. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  118. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  119. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  120. package/src/llama.cpp/requirements.txt +1 -0
  121. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  122. package/src/llama.cpp/src/llama-arch.h +1 -0
  123. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  124. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  125. package/src/llama.cpp/src/llama-grammar.h +12 -3
  126. package/src/llama.cpp/src/llama-kv-cache.h +1 -0
  127. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  128. package/src/llama.cpp/src/llama-model.cpp +69 -5
  129. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  130. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  131. package/src/llama.cpp/src/llama.cpp +147 -0
  132. package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
  133. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  134. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  135. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  136. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  137. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  138. package/src/llama.cpp/common/chat.hpp +0 -55
  139. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
Diff of package/src/llama.cpp/common/arg.cpp (+112 -11):

@@ -2,6 +2,7 @@
 
 #include "log.h"
 #include "sampling.h"
+#include "chat.h"
 
 #include <algorithm>
 #include <climits>
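(The new chat.h include matches the header move in this release: common/chat.hpp is removed, file 138 in the list above, in favor of the new common/chat.h, file 27.)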
@@ -812,13 +813,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
         {"-p", "--prompt"}, "PROMPT",
-        ex == LLAMA_EXAMPLE_MAIN
-            ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
-            : "prompt to start generation with",
+        "prompt to start generation with; for system message, use -sys",
         [](common_params & params, const std::string & value) {
             params.prompt = value;
         }
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sys", "--system-prompt"}, "PROMPT",
+        "system prompt to use with model (if applicable, depending on chat template)",
+        [](common_params & params, const std::string & value) {
+            params.system_prompt = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -943,6 +949,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-st", "--single-turn"},
+        "run conversation for a single turn only, then exit when done\n"
+        "will not be interactive if first turn is predefined with --prompt\n"
+        "(default: false)",
+        [](common_params & params) {
+            params.single_turn = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"-i", "--interactive"},
         string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
@@ -1852,16 +1867,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
     add_opt(common_arg(
         {"-o", "--output", "--output-file"}, "FNAME",
-        string_format("output file (default: '%s')",
-            ex == LLAMA_EXAMPLE_EXPORT_LORA
-                ? params.lora_outfile.c_str()
-                : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
-                    ? params.cvector_outfile.c_str()
-                    : params.out_file.c_str()),
+        string_format("output file (default: '%s')", params.out_file.c_str()),
         [](common_params & params, const std::string & value) {
             params.out_file = value;
-            params.cvector_outfile = value;
-            params.lora_outfile = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
@@ -2446,6 +2454,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.vocoder.use_guide_tokens = true;
         }
     ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--tts-speaker-file"}, "FNAME",
+        "speaker file path for audio generation",
+        [](common_params & params, const std::string & value) {
+            params.vocoder.speaker_file = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS}));
 
     // model-specific
     add_opt(common_arg(
@@ -2501,5 +2516,91 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
 
+    add_opt(common_arg(
+        {"--fim-qwen-1.5b-default"},
+        string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-3b-default"},
+        string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-default"},
+        string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-spec"},
+        string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-14b-spec"},
+        string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }
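Note on the new presets: all five --fim-qwen-* options apply the same server defaults (port 8012, n_gpu_layers = 99 for full GPU offload, flash attention, 1024-token batch and ubatch, n_cache_reuse = 256), and in llama.cpp an n_ctx of 0 means the context size is loaded from the model. The -spec variants additionally configure a Qwen 2.5 Coder 0.5B draft model so the server can use speculative decoding.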