@fugood/llama.node 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. package/CMakeLists.txt +2 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +1 -1
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +8 -8
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +8 -9
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +4 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +43 -9
  25. package/src/llama.cpp/.github/workflows/docker.yml +3 -0
  26. package/src/llama.cpp/CMakeLists.txt +7 -4
  27. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  28. package/src/llama.cpp/common/CMakeLists.txt +0 -2
  29. package/src/llama.cpp/common/arg.cpp +642 -607
  30. package/src/llama.cpp/common/arg.h +22 -22
  31. package/src/llama.cpp/common/common.cpp +79 -281
  32. package/src/llama.cpp/common/common.h +130 -100
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  34. package/src/llama.cpp/common/log.cpp +50 -50
  35. package/src/llama.cpp/common/log.h +18 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  37. package/src/llama.cpp/common/ngram-cache.h +19 -19
  38. package/src/llama.cpp/common/sampling.cpp +116 -108
  39. package/src/llama.cpp/common/sampling.h +20 -20
  40. package/src/llama.cpp/docs/build.md +37 -17
  41. package/src/llama.cpp/examples/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +14 -14
  43. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  47. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  48. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  49. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  50. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  51. package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
  52. package/src/llama.cpp/examples/infill/infill.cpp +40 -86
  53. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
  54. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  55. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  56. package/src/llama.cpp/examples/llava/clip.cpp +1 -0
  57. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  58. package/src/llama.cpp/examples/llava/llava.cpp +37 -3
  59. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  60. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  61. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  62. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  63. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
  64. package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
  65. package/src/llama.cpp/examples/main/main.cpp +64 -109
  66. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  67. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  68. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  69. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  70. package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
  71. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  72. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
  73. package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
  74. package/src/llama.cpp/examples/server/server.cpp +553 -691
  75. package/src/llama.cpp/examples/server/utils.hpp +312 -25
  76. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  77. package/src/llama.cpp/examples/simple/simple.cpp +128 -96
  78. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  79. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  80. package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
  81. package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
  82. package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
  83. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  84. package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
  85. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  86. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  87. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  88. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  89. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  90. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  91. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  92. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  93. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  94. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  95. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  96. package/src/llama.cpp/ggml/include/ggml.h +53 -393
  97. package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
  98. package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
  99. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  100. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
  101. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  102. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  103. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  104. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  105. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  106. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
  107. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  108. package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
  109. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  110. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
  111. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  112. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
  113. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  114. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  115. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  116. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
  117. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  118. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  120. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  121. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
  122. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  123. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  124. package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
  125. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  126. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
  127. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  128. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  129. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  130. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  131. package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
  132. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  133. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  134. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
  135. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  136. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  137. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
  138. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
  141. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  142. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  143. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
  144. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
  145. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
  146. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  148. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  149. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  150. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  151. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  152. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  153. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  154. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  155. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
  156. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
  157. package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
  158. package/src/llama.cpp/include/llama.h +67 -33
  159. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  160. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  161. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  162. package/src/llama.cpp/src/llama-sampling.cpp +745 -105
  163. package/src/llama.cpp/src/llama-sampling.h +21 -2
  164. package/src/llama.cpp/src/llama-vocab.cpp +49 -9
  165. package/src/llama.cpp/src/llama-vocab.h +35 -11
  166. package/src/llama.cpp/src/llama.cpp +2636 -2406
  167. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  168. package/src/llama.cpp/tests/CMakeLists.txt +1 -2
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
  171. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  172. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  173. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  174. package/src/llama.cpp/tests/test-log.cpp +2 -2
  175. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  176. package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
  177. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  178. package/src/llama.cpp/tests/test-rope.cpp +1 -0
  179. package/src/llama.cpp/tests/test-sampling.cpp +162 -137
  180. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  181. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  182. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  183. package/src/llama.cpp/common/train.cpp +0 -1515
  184. package/src/llama.cpp/common/train.h +0 -233
  185. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  186. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  187. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  188. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  189. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
  190. /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
package/CMakeLists.txt CHANGED
@@ -62,6 +62,8 @@ if (VULKAN_SDK)
  find_package(Vulkan REQUIRED)
  endif()
 
+ set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build common")
+
  set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries")
  add_subdirectory("src/llama.cpp")
 
Binary files changed (contents not shown): the prebuilt llama-node.node and node.lib binaries under package/bin/ for each platform.
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "0.3.2",
+ "version": "0.3.3",
  "description": "Llama.cpp for Node.js",
  "main": "lib/index.js",
  "scripts": {
package/src/DetokenizeWorker.cpp CHANGED
@@ -8,7 +8,7 @@ DetokenizeWorker::DetokenizeWorker(const Napi::CallbackInfo &info,
  _tokens(std::move(tokens)) {}
 
  void DetokenizeWorker::Execute() {
- const auto text = ::llama_detokenize(_sess->context(), _tokens);
+ const auto text = ::common_detokenize(_sess->context(), _tokens);
  _text = std::move(text);
  }
 
package/src/EmbeddingWorker.cpp CHANGED
@@ -7,7 +7,7 @@ EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info,
 
  void EmbeddingWorker::Execute() {
  llama_kv_cache_clear(_sess->context());
- auto tokens = ::llama_tokenize(_sess->context(), _text, true);
+ auto tokens = ::common_tokenize(_sess->context(), _text, true);
  // add SEP if not present
  if (tokens.empty() || tokens.back() != llama_token_sep(_sess->model())) {
  tokens.push_back(llama_token_sep(_sess->model()));
@@ -16,7 +16,7 @@ void EmbeddingWorker::Execute() {
  do {
  int ret =
  llama_decode(_sess->context(),
- llama_batch_get_one(tokens.data(), tokens.size(), 0, 0));
+ llama_batch_get_one(tokens.data(), tokens.size()));
  if (ret < 0) {
  SetError("Failed to inference, code: " + std::to_string(ret));
  break;
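These hunks reflect two llama.cpp API changes picked up in 0.3.3: the common helper rename (`llama_tokenize` → `common_tokenize`) and the slimmer `llama_batch_get_one`, which now takes only a token pointer and count. A minimal sketch of the updated call pattern, assuming an already-created `llama_context *ctx`; the `embed_prompt` wrapper is illustrative and not part of the package:

```cpp
// Sketch only: mirrors the post-0.3.3 tokenize/decode pattern from EmbeddingWorker.cpp.
#include <string>
#include <vector>
#include "common.h"   // common_tokenize
#include "llama.h"

static bool embed_prompt(llama_context *ctx, const std::string &text) {
    // common_tokenize replaces the old llama_tokenize helper from common
    std::vector<llama_token> tokens = common_tokenize(ctx, text, /* add_special */ true);

    // llama_batch_get_one no longer takes the pos_0 / seq_id arguments
    const int ret = llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()));
    return ret >= 0;
}
```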
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -34,7 +34,7 @@ size_t findStoppingStrings(const std::string &text,
 
  LlamaCompletionWorker::LlamaCompletionWorker(
  const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
- Napi::Function callback, gpt_params params,
+ Napi::Function callback, common_params params,
  std::vector<std::string> stop_words)
  : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
  _params(params), _stop_words(stop_words) {
@@ -64,11 +64,11 @@ void LlamaCompletionWorker::Execute() {
 
  auto sparams = llama_sampler_chain_default_params();
 
- LlamaCppSampling sampling{gpt_sampler_init(model, _params.sparams),
- gpt_sampler_free};
+ LlamaCppSampling sampling{common_sampler_init(model, _params.sparams),
+ common_sampler_free};
 
  std::vector<llama_token> prompt_tokens =
- ::llama_tokenize(ctx, _params.prompt, add_bos);
+ ::common_tokenize(ctx, _params.prompt, add_bos);
  n_input = prompt_tokens.size();
  if (_sess->tokens_ptr()->size() > 0) {
  n_cur = common_part(*(_sess->tokens_ptr()), prompt_tokens);
@@ -102,18 +102,18 @@ void LlamaCompletionWorker::Execute() {
  _result.truncated = true;
  }
  int ret = llama_decode(
- ctx, llama_batch_get_one(embd->data() + n_cur, n_input, n_cur, 0));
+ ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
  if (ret < 0) {
  SetError("Failed to decode token, code: " + std::to_string(ret));
  break;
  }
  // sample the next token
  const llama_token new_token_id =
- gpt_sampler_sample(sampling.get(), ctx, -1);
- gpt_sampler_accept(sampling.get(), new_token_id, true);
+ common_sampler_sample(sampling.get(), ctx, -1);
+ common_sampler_accept(sampling.get(), new_token_id, true);
  // prepare the next batch
  embd->emplace_back(new_token_id);
- auto token = llama_token_to_piece(ctx, new_token_id);
+ auto token = common_token_to_piece(ctx, new_token_id);
  _result.text += token;
  n_cur += n_input;
  _result.tokens_evaluated += n_input;
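The completion worker tracks the sampler API rename in llama.cpp's common library (`gpt_sampler_*` → `common_sampler_*`, `llama_token_to_piece` → `common_token_to_piece`). A minimal generation-loop sketch with the renamed helpers, assuming `model`, `ctx`, and a populated `common_params params` already exist; everything outside the `common_*`/`llama_*` calls is illustrative:

```cpp
// Sketch only: the renamed sampling helpers used by LlamaCompletionWorker.cpp.
common_sampler *smpl = common_sampler_init(model, params.sparams);

std::string out;
for (int i = 0; i < 32; i++) {
    // sample from the logits of the last decoded position
    llama_token id = common_sampler_sample(smpl, ctx, -1);
    common_sampler_accept(smpl, id, /* accept_grammar */ true);

    out += common_token_to_piece(ctx, id);  // was llama_token_to_piece in 0.3.2

    // feed the sampled token back in with the two-argument llama_batch_get_one
    if (llama_decode(ctx, llama_batch_get_one(&id, 1)) < 0) {
        break;
    }
}
common_sampler_free(smpl);
```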
package/src/LlamaCompletionWorker.h CHANGED
@@ -12,7 +12,7 @@ class LlamaCompletionWorker : public Napi::AsyncWorker,
  public Napi::Promise::Deferred {
  public:
  LlamaCompletionWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
- Napi::Function callback, gpt_params params,
+ Napi::Function callback, common_params params,
  std::vector<std::string> stop_words = {});
 
  ~LlamaCompletionWorker();
@@ -28,7 +28,7 @@ protected:
 
  private:
  LlamaSessionPtr _sess;
- gpt_params _params;
+ common_params _params;
  std::vector<std::string> _stop_words;
  Napi::ThreadSafeFunction _tsfn;
  bool _has_callback = false;
package/src/LlamaContext.cpp CHANGED
@@ -7,8 +7,8 @@
  #include "SaveSessionWorker.h"
  #include "TokenizeWorker.h"
 
- std::vector<llama_chat_msg> get_messages(Napi::Array messages) {
- std::vector<llama_chat_msg> chat;
+ std::vector<common_chat_msg> get_messages(Napi::Array messages) {
+ std::vector<common_chat_msg> chat;
  for (size_t i = 0; i < messages.Length(); i++) {
  auto message = messages.Get(i).As<Napi::Object>();
  chat.push_back({
@@ -67,7 +67,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  }
  auto options = info[0].As<Napi::Object>();
 
- gpt_params params;
+ common_params params;
  params.model = get_option<std::string>(options, "model", "");
  if (params.model.empty()) {
  Napi::TypeError::New(env, "Model is required").ThrowAsJavaScriptException();
@@ -86,7 +86,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  llama_backend_init();
  llama_numa_init(params.numa);
 
- auto result = llama_init_from_gpt_params(params);
+ auto result = common_init_from_params(params);
 
  if (result.model == nullptr || result.context == nullptr) {
  Napi::TypeError::New(env, "Failed to load model")
@@ -94,7 +94,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  }
 
  _sess = std::make_shared<LlamaSession>(result.model, result.context, params);
- _info = gpt_params_get_system_info(params);
+ _info = common_params_get_system_info(params);
  }
 
  // getSystemInfo(): string
@@ -109,7 +109,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
  Napi::TypeError::New(env, "Array expected").ThrowAsJavaScriptException();
  }
  auto messages = info[0].As<Napi::Array>();
- auto formatted = llama_chat_apply_template(_sess->model(), "", get_messages(messages), true);
+ auto formatted = common_chat_apply_template(_sess->model(), "", get_messages(messages), true);
  return Napi::String::New(env, formatted);
  }
 
@@ -133,10 +133,10 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  }
  auto options = info[0].As<Napi::Object>();
 
- gpt_params params = _sess->params();
+ common_params params = _sess->params();
  if (options.Has("messages") && options.Get("messages").IsArray()) {
  auto messages = options.Get("messages").As<Napi::Array>();
- auto formatted = llama_chat_apply_template(_sess->model(), "", get_messages(messages), true);
+ auto formatted = common_chat_apply_template(_sess->model(), "", get_messages(messages), true);
  params.prompt = formatted;
  } else {
  params.prompt = get_option<std::string>(options, "prompt", "");
@@ -150,7 +150,6 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  params.sparams.top_k = get_option<int32_t>(options, "top_k", 40);
  params.sparams.top_p = get_option<float>(options, "top_p", 0.95f);
  params.sparams.min_p = get_option<float>(options, "min_p", 0.05f);
- params.sparams.tfs_z = get_option<float>(options, "tfs_z", 1.00f);
  params.sparams.mirostat = get_option<int32_t>(options, "mirostat", 0.00f);
  params.sparams.mirostat_tau =
  get_option<float>(options, "mirostat_tau", 5.00f);
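Beyond the type renames (`gpt_params` → `common_params`, `llama_init_from_gpt_params` → `common_init_from_params`, `llama_chat_apply_template` → `common_chat_apply_template`), the `tfs_z` option disappears because tail-free sampling was removed upstream. A minimal initialization sketch against the renamed API, assuming a valid GGUF model; the path is illustrative:

```cpp
// Sketch only: context setup with the renamed common helpers, as in LlamaContext.cpp.
common_params params;
params.model = "/path/to/model.gguf";  // illustrative path

llama_backend_init();
llama_numa_init(params.numa);

auto result = common_init_from_params(params);  // was llama_init_from_gpt_params
if (result.model == nullptr || result.context == nullptr) {
    // handle load failure
}
```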
package/src/TokenizeWorker.cpp CHANGED
@@ -6,7 +6,7 @@ TokenizeWorker::TokenizeWorker(const Napi::CallbackInfo &info,
  : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text) {}
 
  void TokenizeWorker::Execute() {
- const auto tokens = ::llama_tokenize(_sess->context(), _text, false);
+ const auto tokens = ::common_tokenize(_sess->context(), _text, false);
  _result.tokens = std::move(tokens);
  }
 
package/src/common.hpp CHANGED
@@ -13,7 +13,7 @@
 
  typedef std::unique_ptr<llama_model, decltype(&llama_free_model)> LlamaCppModel;
  typedef std::unique_ptr<llama_context, decltype(&llama_free)> LlamaCppContext;
- typedef std::unique_ptr<gpt_sampler, decltype(&gpt_sampler_free)>
+ typedef std::unique_ptr<common_sampler, decltype(&common_sampler_free)>
  LlamaCppSampling;
  typedef std::unique_ptr<llama_batch, decltype(&llama_batch_free)> LlamaCppBatch;
 
@@ -47,7 +47,7 @@ constexpr T get_option(const Napi::Object &options, const std::string &name,
 
  class LlamaSession {
  public:
- LlamaSession(llama_model *model, llama_context *ctx, gpt_params params)
+ LlamaSession(llama_model *model, llama_context *ctx, common_params params)
  : model_(LlamaCppModel(model, llama_free_model)),
  ctx_(LlamaCppContext(ctx, llama_free)), params_(params) {
  tokens_.reserve(params.n_ctx);
@@ -65,7 +65,7 @@ public:
  tokens_ = std::move(tokens);
  }
 
- inline const gpt_params &params() const { return params_; }
+ inline const common_params &params() const { return params_; }
 
  inline std::mutex &get_mutex() { return mutex; }
 
@@ -79,7 +79,7 @@ public:
  private:
  LlamaCppModel model_;
  LlamaCppContext ctx_;
- const gpt_params params_;
+ const common_params params_;
  std::vector<llama_token> tokens_{};
  std::mutex mutex;
  };
package/src/llama.cpp/.github/workflows/build.yml CHANGED
@@ -55,7 +55,13 @@ jobs:
  sysctl -a
  mkdir build
  cd build
- cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
+ cmake .. \
+ -DLLAMA_FATAL_WARNINGS=ON \
+ -DLLAMA_CURL=ON \
+ -DGGML_METAL_USE_BF16=ON \
+ -DGGML_METAL_EMBED_LIBRARY=ON \
+ -DGGML_RPC=ON \
+ -DBUILD_SHARED_LIBS=OFF
  cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
 
  - name: Test
@@ -92,7 +98,7 @@ jobs:
  name: llama-bin-macos-arm64.zip
 
  macOS-latest-cmake-x64:
- runs-on: macos-12
+ runs-on: macos-13
 
  steps:
  - name: Clone
@@ -113,7 +119,12 @@ jobs:
  sysctl -a
  # Metal is disabled due to intermittent failures with Github runners not having a GPU:
  # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
- cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
+ cmake -B build \
+ -DLLAMA_FATAL_WARNINGS=ON \
+ -DLLAMA_CURL=ON \
+ -DGGML_METAL=OFF \
+ -DGGML_RPC=ON \
+ -DBUILD_SHARED_LIBS=OFF
  cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 
  - name: Test
@@ -394,15 +405,36 @@ jobs:
  - name: Build with native CMake HIP support
  id: cmake_build
  run: |
- cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON
+ cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIP=ON
  cmake --build build --config Release -j $(nproc)
 
  - name: Build with legacy HIP support
  id: cmake_build_legacy_hip
  run: |
- cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
+ cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
  cmake --build build2 --config Release -j $(nproc)
 
+ ubuntu-22-cmake-musa:
+ runs-on: ubuntu-22.04
+ container: mthreads/musa:rc3.1.0-devel-ubuntu22.04
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v4
+
+ - name: Dependencies
+ id: depends
+ run: |
+ apt-get update
+ apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+ - name: Build with native CMake MUSA support
+ id: cmake_build
+ run: |
+ cmake -B build -S . -DGGML_MUSA=ON
+ cmake --build build --config Release -j $(nproc)
+
  ubuntu-22-cmake-sycl:
  runs-on: ubuntu-22.04
 
@@ -569,6 +601,7 @@ jobs:
  mkdir build
  cd build
  cmake -G Xcode .. \
+ -DGGML_METAL_USE_BF16=ON \
  -DGGML_METAL_EMBED_LIBRARY=ON \
  -DLLAMA_BUILD_EXAMPLES=OFF \
  -DLLAMA_BUILD_TESTS=OFF \
@@ -599,6 +632,7 @@ jobs:
  mkdir build
  cd build
  cmake -G Xcode .. \
+ -DGGML_METAL_USE_BF16=ON \
  -DGGML_METAL_EMBED_LIBRARY=ON \
  -DLLAMA_BUILD_EXAMPLES=OFF \
  -DLLAMA_BUILD_TESTS=OFF \
@@ -734,7 +768,7 @@ jobs:
  id: clone_kompute
  if: ${{ matrix.build == 'kompute-x64' }}
  run: |
- git submodule update --init ggml/src/kompute
+ git submodule update --init ggml/src/ggml-kompute/kompute
 
  - name: Download OpenBLAS
  id: get_openblas
@@ -917,7 +951,7 @@ jobs:
  shell: bash
 
  env:
- WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
+ WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
  WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
  ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
  steps:
@@ -1001,7 +1035,7 @@ jobs:
  run: |
  $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
  $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
- cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
+ cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
  cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
 
  windows-latest-cmake-hip-release:
@@ -1037,7 +1071,7 @@ jobs:
  run: |
  $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
  $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
- cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
+ cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
  cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
  md "build\bin\rocblas\library\"
  cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
package/src/llama.cpp/.github/workflows/docker.yml CHANGED
@@ -43,6 +43,9 @@ jobs:
  - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
  - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
  - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+ - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
+ - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
+ - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
  # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
  #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
  #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
package/src/llama.cpp/CMakeLists.txt CHANGED
@@ -63,7 +63,7 @@ option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
  option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
 
  # utils
- option(LLAMA_BUILD_COMMON "llama: build common utils library" ON)
+ option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
 
  # extra artifacts
  option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
@@ -88,6 +88,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
  set(GGML_LLAMAFILE_DEFAULT ON)
  endif()
 
+ if (NOT DEFINED GGML_AMX)
+ set(GGML_AMX ON)
+ endif()
+
  if (NOT DEFINED GGML_CUDA_GRAPHS)
  set(GGML_CUDA_GRAPHS_DEFAULT ON)
  endif()
@@ -136,7 +140,6 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
  set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
  set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
 
-
  # At the moment some compile definitions are placed within the ggml/src
  # directory but not exported on the `ggml` target. This could be improved by
  # determining _precisely_ which defines are necessary for the llama-config
@@ -201,12 +204,12 @@ if (LLAMA_BUILD_COMMON)
  add_subdirectory(common)
  endif()
 
- if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
+ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
  include(CTest)
  add_subdirectory(tests)
  endif()
 
- if (LLAMA_BUILD_EXAMPLES)
+ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
  add_subdirectory(examples)
  add_subdirectory(pocs)
  endif()
package/src/llama.cpp/cmake/arm64-apple-clang.cmake CHANGED
@@ -0,0 +1,16 @@
+ set( CMAKE_SYSTEM_NAME Darwin )
+ set( CMAKE_SYSTEM_PROCESSOR arm64 )
+
+ set( target arm64-apple-darwin-macho )
+
+ set( CMAKE_C_COMPILER clang )
+ set( CMAKE_CXX_COMPILER clang++ )
+
+ set( CMAKE_C_COMPILER_TARGET ${target} )
+ set( CMAKE_CXX_COMPILER_TARGET ${target} )
+
+ set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
+ set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
+
+ set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
+ set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
package/src/llama.cpp/common/CMakeLists.txt CHANGED
@@ -66,8 +66,6 @@ add_library(${TARGET} STATIC
  ngram-cache.h
  sampling.cpp
  sampling.h
- train.cpp
- train.h
  )
 
  if (BUILD_SHARED_LIBS)