@fugood/llama.node 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +8 -9
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +43 -9
- package/src/llama.cpp/.github/workflows/docker.yml +3 -0
- package/src/llama.cpp/CMakeLists.txt +7 -4
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +0 -2
- package/src/llama.cpp/common/arg.cpp +642 -607
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +79 -281
- package/src/llama.cpp/common/common.h +130 -100
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +116 -108
- package/src/llama.cpp/common/sampling.h +20 -20
- package/src/llama.cpp/docs/build.md +37 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +14 -14
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
- package/src/llama.cpp/examples/infill/infill.cpp +40 -86
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/clip.cpp +1 -0
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +37 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
- package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
- package/src/llama.cpp/examples/main/main.cpp +64 -109
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
- package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
- package/src/llama.cpp/examples/server/server.cpp +553 -691
- package/src/llama.cpp/examples/server/utils.hpp +312 -25
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +128 -96
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +53 -393
- package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
- package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
- package/src/llama.cpp/include/llama.h +67 -33
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-sampling.cpp +745 -105
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +49 -9
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +2636 -2406
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
- package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +1 -0
- package/src/llama.cpp/tests/test-sampling.cpp +162 -137
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
- /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
package/CMakeLists.txt
CHANGED

package/bin/darwin/{arm64,x64}/llama-node.node
package/bin/linux/{arm64,x64}/llama-node.node
package/bin/linux-vulkan/{arm64,x64}/llama-node.node
package/bin/win32/{arm64,x64}/llama-node.node
package/bin/win32/{arm64,x64}/node.lib
package/bin/win32-vulkan/{arm64,x64}/llama-node.node
package/bin/win32-vulkan/{arm64,x64}/node.lib
CHANGED
Binary files

package/package.json
CHANGED
package/src/DetokenizeWorker.cpp
CHANGED
@@ -8,7 +8,7 @@ DetokenizeWorker::DetokenizeWorker(const Napi::CallbackInfo &info,
       _tokens(std::move(tokens)) {}
 
 void DetokenizeWorker::Execute() {
-  const auto text = ::
+  const auto text = ::common_detokenize(_sess->context(), _tokens);
   _text = std::move(text);
 }
 
package/src/EmbeddingWorker.cpp
CHANGED
@@ -7,7 +7,7 @@ EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info,
 
 void EmbeddingWorker::Execute() {
   llama_kv_cache_clear(_sess->context());
-  auto tokens = ::
+  auto tokens = ::common_tokenize(_sess->context(), _text, true);
   // add SEP if not present
   if (tokens.empty() || tokens.back() != llama_token_sep(_sess->model())) {
     tokens.push_back(llama_token_sep(_sess->model()));
@@ -16,7 +16,7 @@ void EmbeddingWorker::Execute() {
   do {
     int ret =
         llama_decode(_sess->context(),
-                     llama_batch_get_one(tokens.data(), tokens.size()
+                     llama_batch_get_one(tokens.data(), tokens.size()));
     if (ret < 0) {
       SetError("Failed to inference, code: " + std::to_string(ret));
       break;
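The embedding path above now calls the renamed common_tokenize helper and the two-argument llama_batch_get_one. A minimal standalone sketch of the same flow, assuming a loaded llama_model/llama_context pair and the llama.cpp common header (not code from this package):

// Sketch only: follows the calls shown in the diff; error handling trimmed.
#include <string>
#include <vector>
#include "common.h"
#include "llama.h"

static int embed_text(llama_model *model, llama_context *ctx, const std::string &text) {
  llama_kv_cache_clear(ctx);                             // start from an empty KV cache
  std::vector<llama_token> tokens = common_tokenize(ctx, text, true);  // add_special = true
  if (tokens.empty() || tokens.back() != llama_token_sep(model)) {
    tokens.push_back(llama_token_sep(model));            // embeddings expect a trailing SEP
  }
  // llama_batch_get_one() now takes only (tokens, n_tokens); positions are implicit
  return llama_decode(ctx, llama_batch_get_one(tokens.data(), (int32_t) tokens.size()));
}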
package/src/LlamaCompletionWorker.cpp
CHANGED
@@ -34,7 +34,7 @@ size_t findStoppingStrings(const std::string &text,
 
 LlamaCompletionWorker::LlamaCompletionWorker(
     const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
-    Napi::Function callback,
+    Napi::Function callback, common_params params,
     std::vector<std::string> stop_words)
     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
       _params(params), _stop_words(stop_words) {
@@ -64,11 +64,11 @@ void LlamaCompletionWorker::Execute() {
 
   auto sparams = llama_sampler_chain_default_params();
 
-  LlamaCppSampling sampling{
-
+  LlamaCppSampling sampling{common_sampler_init(model, _params.sparams),
+                            common_sampler_free};
 
   std::vector<llama_token> prompt_tokens =
-      ::
+      ::common_tokenize(ctx, _params.prompt, add_bos);
   n_input = prompt_tokens.size();
   if (_sess->tokens_ptr()->size() > 0) {
     n_cur = common_part(*(_sess->tokens_ptr()), prompt_tokens);
@@ -102,18 +102,18 @@ void LlamaCompletionWorker::Execute() {
       _result.truncated = true;
     }
     int ret = llama_decode(
-        ctx, llama_batch_get_one(embd->data() + n_cur, n_input
+        ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
     if (ret < 0) {
       SetError("Failed to decode token, code: " + std::to_string(ret));
       break;
     }
     // sample the next token
     const llama_token new_token_id =
-
-
+        common_sampler_sample(sampling.get(), ctx, -1);
+    common_sampler_accept(sampling.get(), new_token_id, true);
     // prepare the next batch
     embd->emplace_back(new_token_id);
-    auto token =
+    auto token = common_token_to_piece(ctx, new_token_id);
     _result.text += token;
     n_cur += n_input;
     _result.tokens_evaluated += n_input;
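The completion worker now tracks the llama.cpp common API rename (gpt_* became common_*) and the slimmed-down llama_batch_get_one. A rough sketch of the same decode-and-sample loop outside the N-API worker, assuming the llama.cpp common/sampling headers; generate() and max_tokens are illustrative names, not part of the package:

// Sketch of the renamed sampling API; not the package's implementation.
#include <string>
#include <vector>
#include "common.h"
#include "sampling.h"
#include "llama.h"

static std::string generate(llama_model *model, llama_context *ctx,
                            common_params &params, int max_tokens) {
  common_sampler *smpl = common_sampler_init(model, params.sparams);   // was gpt_sampler_init
  std::vector<llama_token> embd = common_tokenize(ctx, params.prompt, true);
  std::string out;
  int n_cur = 0;
  int n_input = (int) embd.size();
  for (int i = 0; i < max_tokens; i++) {
    // feed the pending tokens; positions and sequence ids are implicit now
    if (llama_decode(ctx, llama_batch_get_one(embd.data() + n_cur, n_input)) < 0) break;
    const llama_token id = common_sampler_sample(smpl, ctx, -1);
    common_sampler_accept(smpl, id, true);
    if (llama_token_is_eog(model, id)) break;
    out += common_token_to_piece(ctx, id);
    embd.push_back(id);
    n_cur += n_input;
    n_input = 1;                                         // one token at a time after the prompt
  }
  common_sampler_free(smpl);
  return out;
}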
package/src/LlamaCompletionWorker.h
CHANGED
@@ -12,7 +12,7 @@ class LlamaCompletionWorker : public Napi::AsyncWorker,
                               public Napi::Promise::Deferred {
 public:
   LlamaCompletionWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
-                        Napi::Function callback,
+                        Napi::Function callback, common_params params,
                         std::vector<std::string> stop_words = {});
 
   ~LlamaCompletionWorker();
@@ -28,7 +28,7 @@ protected:
 
 private:
   LlamaSessionPtr _sess;
-
+  common_params _params;
   std::vector<std::string> _stop_words;
   Napi::ThreadSafeFunction _tsfn;
   bool _has_callback = false;
package/src/LlamaContext.cpp
CHANGED
@@ -7,8 +7,8 @@
 #include "SaveSessionWorker.h"
 #include "TokenizeWorker.h"
 
-std::vector<
-std::vector<
+std::vector<common_chat_msg> get_messages(Napi::Array messages) {
+  std::vector<common_chat_msg> chat;
   for (size_t i = 0; i < messages.Length(); i++) {
     auto message = messages.Get(i).As<Napi::Object>();
     chat.push_back({
@@ -67,7 +67,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   }
   auto options = info[0].As<Napi::Object>();
 
-
+  common_params params;
   params.model = get_option<std::string>(options, "model", "");
   if (params.model.empty()) {
     Napi::TypeError::New(env, "Model is required").ThrowAsJavaScriptException();
@@ -86,7 +86,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   llama_backend_init();
   llama_numa_init(params.numa);
 
-  auto result =
+  auto result = common_init_from_params(params);
 
   if (result.model == nullptr || result.context == nullptr) {
     Napi::TypeError::New(env, "Failed to load model")
@@ -94,7 +94,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   }
 
   _sess = std::make_shared<LlamaSession>(result.model, result.context, params);
-  _info =
+  _info = common_params_get_system_info(params);
 }
 
 // getSystemInfo(): string
@@ -109,7 +109,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
     Napi::TypeError::New(env, "Array expected").ThrowAsJavaScriptException();
   }
   auto messages = info[0].As<Napi::Array>();
-  auto formatted =
+  auto formatted = common_chat_apply_template(_sess->model(), "", get_messages(messages), true);
   return Napi::String::New(env, formatted);
 }
 
@@ -133,10 +133,10 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   }
   auto options = info[0].As<Napi::Object>();
 
-
+  common_params params = _sess->params();
   if (options.Has("messages") && options.Get("messages").IsArray()) {
     auto messages = options.Get("messages").As<Napi::Array>();
-    auto formatted =
+    auto formatted = common_chat_apply_template(_sess->model(), "", get_messages(messages), true);
     params.prompt = formatted;
   } else {
     params.prompt = get_option<std::string>(options, "prompt", "");
@@ -150,7 +150,6 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   params.sparams.top_k = get_option<int32_t>(options, "top_k", 40);
   params.sparams.top_p = get_option<float>(options, "top_p", 0.95f);
   params.sparams.min_p = get_option<float>(options, "min_p", 0.05f);
-  params.sparams.tfs_z = get_option<float>(options, "tfs_z", 1.00f);
   params.sparams.mirostat = get_option<int32_t>(options, "mirostat", 0.00f);
   params.sparams.mirostat_tau =
       get_option<float>(options, "mirostat_tau", 5.00f);
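LlamaContext now fills a common_params struct, loads the model through common_init_from_params, and formats chat messages with common_chat_apply_template. A hedged end-to-end sketch of that flow without the N-API layer, assuming the llama.cpp common header; the model path and messages are placeholders:

// Sketch of the renamed init/chat-template helpers; not this package's code.
#include <cstdio>
#include <string>
#include <vector>
#include "common.h"
#include "llama.h"

int main() {
  common_params params;
  params.model = "model.gguf";                         // placeholder path

  llama_backend_init();
  llama_numa_init(params.numa);

  auto result = common_init_from_params(params);       // loads both model and context
  if (result.model == nullptr || result.context == nullptr) {
    return 1;
  }

  std::vector<common_chat_msg> chat = {
      {"system", "You are a helpful assistant."},
      {"user", "Hello!"},
  };
  // Empty template string: fall back to the chat template stored in the GGUF metadata.
  std::string prompt = common_chat_apply_template(result.model, "", chat, /* add_ass */ true);
  printf("%s\n", prompt.c_str());

  llama_free(result.context);
  llama_free_model(result.model);
  llama_backend_free();
  return 0;
}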
package/src/TokenizeWorker.cpp
CHANGED
@@ -6,7 +6,7 @@ TokenizeWorker::TokenizeWorker(const Napi::CallbackInfo &info,
     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text) {}
 
 void TokenizeWorker::Execute() {
-  const auto tokens = ::
+  const auto tokens = ::common_tokenize(_sess->context(), _text, false);
   _result.tokens = std::move(tokens);
 }
 
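Tokenization goes through the renamed common helpers in both directions. A small round-trip sketch, assuming a valid llama_context (the helper name round_trip is illustrative):

// Round-trip check using the helpers shown in the TokenizeWorker/DetokenizeWorker diffs.
#include <string>
#include <vector>
#include "common.h"

static bool round_trip(llama_context *ctx, const std::string &text) {
  // add_special = false: no BOS token is prepended, matching TokenizeWorker
  const std::vector<llama_token> tokens = common_tokenize(ctx, text, false);
  const std::string restored = common_detokenize(ctx, tokens);
  return restored == text;   // generally holds for plain text without special tokens
}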
package/src/common.hpp
CHANGED
@@ -13,7 +13,7 @@
 
 typedef std::unique_ptr<llama_model, decltype(&llama_free_model)> LlamaCppModel;
 typedef std::unique_ptr<llama_context, decltype(&llama_free)> LlamaCppContext;
-typedef std::unique_ptr<
+typedef std::unique_ptr<common_sampler, decltype(&common_sampler_free)>
     LlamaCppSampling;
 typedef std::unique_ptr<llama_batch, decltype(&llama_batch_free)> LlamaCppBatch;
 
@@ -47,7 +47,7 @@ constexpr T get_option(const Napi::Object &options, const std::string &name,
 
 class LlamaSession {
 public:
-  LlamaSession(llama_model *model, llama_context *ctx,
+  LlamaSession(llama_model *model, llama_context *ctx, common_params params)
       : model_(LlamaCppModel(model, llama_free_model)),
         ctx_(LlamaCppContext(ctx, llama_free)), params_(params) {
     tokens_.reserve(params.n_ctx);
@@ -65,7 +65,7 @@ public:
     tokens_ = std::move(tokens);
   }
 
-  inline const
+  inline const common_params &params() const { return params_; }
 
   inline std::mutex &get_mutex() { return mutex; }
 
@@ -79,7 +79,7 @@ public:
 private:
   LlamaCppModel model_;
  LlamaCppContext ctx_;
-  const
+  const common_params params_;
   std::vector<llama_token> tokens_{};
   std::mutex mutex;
 };
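common.hpp keeps each llama.cpp handle in a std::unique_ptr alias whose deleter is the matching C free function; the sampler alias simply switches to the renamed common_sampler type and common_sampler_free. An illustration of how such an alias is constructed and used, assuming the llama.cpp common/sampling headers:

// The unique_ptr-with-C-deleter pattern from common.hpp, shown in isolation.
#include <memory>
#include "common.h"
#include "sampling.h"

typedef std::unique_ptr<common_sampler, decltype(&common_sampler_free)> LlamaCppSampling;

void sample_example(llama_model *model, const common_params &params) {
  // The deleter is passed next to the raw pointer, so the sampler is released
  // automatically when `sampling` leaves scope, including on early returns.
  LlamaCppSampling sampling{common_sampler_init(model, params.sparams),
                            common_sampler_free};
  // pass sampling.get() wherever a common_sampler * is expected
}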
package/src/llama.cpp/.github/workflows/build.yml
CHANGED
@@ -55,7 +55,13 @@ jobs:
           sysctl -a
           mkdir build
           cd build
-          cmake
+          cmake .. \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=ON \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DGGML_RPC=ON \
+            -DBUILD_SHARED_LIBS=OFF
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -92,7 +98,7 @@ jobs:
           name: llama-bin-macos-arm64.zip
 
   macOS-latest-cmake-x64:
-    runs-on: macos-
+    runs-on: macos-13
 
     steps:
       - name: Clone
@@ -113,7 +119,12 @@ jobs:
           sysctl -a
           # Metal is disabled due to intermittent failures with Github runners not having a GPU:
           # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=ON \
+            -DGGML_METAL=OFF \
+            -DGGML_RPC=ON \
+            -DBUILD_SHARED_LIBS=OFF
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -394,15 +405,36 @@ jobs:
       - name: Build with native CMake HIP support
         id: cmake_build
         run: |
-          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -
+          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIP=ON
           cmake --build build --config Release -j $(nproc)
 
       - name: Build with legacy HIP support
         id: cmake_build_legacy_hip
         run: |
-          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -
+          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
           cmake --build build2 --config Release -j $(nproc)
 
+  ubuntu-22-cmake-musa:
+    runs-on: ubuntu-22.04
+    container: mthreads/musa:rc3.1.0-devel-ubuntu22.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        run: |
+          apt-get update
+          apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+      - name: Build with native CMake MUSA support
+        id: cmake_build
+        run: |
+          cmake -B build -S . -DGGML_MUSA=ON
+          cmake --build build --config Release -j $(nproc)
+
   ubuntu-22-cmake-sycl:
     runs-on: ubuntu-22.04
 
@@ -569,6 +601,7 @@ jobs:
           mkdir build
           cd build
           cmake -G Xcode .. \
+            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
@@ -599,6 +632,7 @@ jobs:
           mkdir build
           cd build
           cmake -G Xcode .. \
+            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
@@ -734,7 +768,7 @@ jobs:
         id: clone_kompute
         if: ${{ matrix.build == 'kompute-x64' }}
         run: |
-          git submodule update --init ggml/src/kompute
+          git submodule update --init ggml/src/ggml-kompute/kompute
 
       - name: Download OpenBLAS
         id: get_openblas
@@ -917,7 +951,7 @@ jobs:
     shell: bash
 
     env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
       WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
       ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
     steps:
@@ -1001,7 +1035,7 @@ jobs:
        run: |
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
 
   windows-latest-cmake-hip-release:
@@ -1037,7 +1071,7 @@ jobs:
        run: |
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
          md "build\bin\rocblas\library\"
          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
package/src/llama.cpp/.github/workflows/docker.yml
CHANGED
@@ -43,6 +43,9 @@ jobs:
           - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
           # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
           #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
package/src/llama.cpp/CMakeLists.txt
CHANGED
@@ -63,7 +63,7 @@ option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
 option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
 
 # utils
-option(LLAMA_BUILD_COMMON "llama: build common utils library"
+option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
 
 # extra artifacts
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
@@ -88,6 +88,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
     set(GGML_LLAMAFILE_DEFAULT ON)
 endif()
 
+if (NOT DEFINED GGML_AMX)
+    set(GGML_AMX ON)
+endif()
+
 if (NOT DEFINED GGML_CUDA_GRAPHS)
     set(GGML_CUDA_GRAPHS_DEFAULT ON)
 endif()
@@ -136,7 +140,6 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
 set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
 set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
 
-
 # At the moment some compile definitions are placed within the ggml/src
 # directory but not exported on the `ggml` target. This could be improved by
 # determining _precisely_ which defines are necessary for the llama-config
@@ -201,12 +204,12 @@ if (LLAMA_BUILD_COMMON)
     add_subdirectory(common)
 endif()
 
-if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
     include(CTest)
     add_subdirectory(tests)
 endif()
 
-if (LLAMA_BUILD_EXAMPLES)
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
     add_subdirectory(examples)
     add_subdirectory(pocs)
 endif()
package/src/llama.cpp/cmake/arm64-apple-clang.cmake
ADDED
@@ -0,0 +1,16 @@
+set( CMAKE_SYSTEM_NAME Darwin )
+set( CMAKE_SYSTEM_PROCESSOR arm64 )
+
+set( target arm64-apple-darwin-macho )
+
+set( CMAKE_C_COMPILER clang )
+set( CMAKE_CXX_COMPILER clang++ )
+
+set( CMAKE_C_COMPILER_TARGET ${target} )
+set( CMAKE_CXX_COMPILER_TARGET ${target} )
+
+set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
+set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
+
+set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
+set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )