@fugood/llama.node 0.3.4 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +32 -1
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +16 -0
- package/src/LlamaContext.cpp +44 -2
package/CMakeLists.txt
CHANGED
@@ -7,7 +7,7 @@ project (llama-node)
 set(CMAKE_CXX_STANDARD 17)
 
 execute_process(COMMAND
-  git apply ${CMAKE_CURRENT_SOURCE_DIR}/scripts/
+  git apply ${CMAKE_CURRENT_SOURCE_DIR}/scripts/llama.cpp.patch
   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
 )
 

package/bin/** (prebuilt llama-node.node and node.lib binaries for darwin, linux, linux-vulkan, win32, and win32-vulkan)
CHANGED
Binary files changed; no textual diff.
package/lib/binding.ts
CHANGED
@@ -9,13 +9,34 @@ export type LlamaModelOptions = {
   model: string
   embedding?: boolean
   embd_normalize?: number
-  pooling_type?:
+  pooling_type?: 'none' | 'mean' | 'cls' | 'last' | 'rank'
   n_ctx?: number
   n_batch?: number
+  n_ubatch?: number
   n_threads?: number
   n_gpu_layers?: number
+  flash_attn?: boolean
+  cache_type_k?:
+    | 'f16'
+    | 'f32'
+    | 'q8_0'
+    | 'q4_0'
+    | 'q4_1'
+    | 'iq4_nl'
+    | 'q5_0'
+    | 'q5_1'
+  cache_type_v?:
+    | 'f16'
+    | 'f32'
+    | 'q8_0'
+    | 'q4_0'
+    | 'q4_1'
+    | 'iq4_nl'
+    | 'q5_0'
+    | 'q5_1'
   use_mlock?: boolean
   use_mmap?: boolean
+  vocab_only?: boolean
 }
 
 export type LlamaCompletionOptions = {

@@ -53,6 +74,16 @@ export type LlamaCompletionResult = {
   tokens_predicted: number
   tokens_evaluated: number
   truncated: boolean
+  timings: {
+    prompt_n: number
+    prompt_ms: number
+    prompt_per_token_ms: number
+    prompt_per_second: number
+    predicted_n: number
+    predicted_ms: number
+    predicted_per_token_ms: number
+    predicted_per_second: number
+  }
 }
 
 export type LlamaCompletionToken = {
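
To illustrate the new LlamaModelOptions fields, here is a minimal TypeScript sketch of an options object exercising them. The import path and the model filename are assumptions for illustration; only the field names and value unions come from the diff above.

import type { LlamaModelOptions } from '@fugood/llama.node/lib/binding' // assumed import path

// Options exercising the 0.3.6 additions: n_ubatch, flash_attn, quantized
// KV-cache types, the string-union pooling_type, and vocab_only.
const options: LlamaModelOptions = {
  model: './models/example-q4_0.gguf', // hypothetical model path
  n_ctx: 4096,
  n_batch: 2048,
  n_ubatch: 512,
  flash_attn: true, // llama.cpp generally requires flash attention for a quantized V cache
  cache_type_k: 'q8_0',
  cache_type_v: 'q8_0',
  pooling_type: 'mean', // only meaningful for embedding contexts
  vocab_only: false,
}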
package/package.json
CHANGED

package/src/LlamaCompletionWorker.cpp
CHANGED

@@ -159,6 +159,22 @@ void LlamaCompletionWorker::OnOK() {
       Napi::Boolean::New(Napi::AsyncWorker::Env(), _result.truncated));
   result.Set("text",
       Napi::String::New(Napi::AsyncWorker::Env(), _result.text.c_str()));
+
+  auto ctx = _sess->context();
+  const auto timings_token = llama_perf_context(ctx);
+
+  auto timingsResult = Napi::Object::New(Napi::AsyncWorker::Env());
+  timingsResult.Set("prompt_n", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.n_p_eval));
+  timingsResult.Set("prompt_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_p_eval_ms));
+  timingsResult.Set("prompt_per_token_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_p_eval_ms / timings_token.n_p_eval));
+  timingsResult.Set("prompt_per_second", Napi::Number::New(Napi::AsyncWorker::Env(), 1e3 / timings_token.t_p_eval_ms * timings_token.n_p_eval));
+  timingsResult.Set("predicted_n", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.n_eval));
+  timingsResult.Set("predicted_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_eval_ms));
+  timingsResult.Set("predicted_per_token_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_eval_ms / timings_token.n_eval));
+  timingsResult.Set("predicted_per_second", Napi::Number::New(Napi::AsyncWorker::Env(), 1e3 / timings_token.t_eval_ms * timings_token.n_eval));
+
+  result.Set("timings", timingsResult);
+
   Napi::Promise::Deferred::Resolve(result);
 }
 
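
The timings object added here mirrors llama.cpp's llama_perf_context counters (prompt vs. generation token counts and wall-clock times). A small sketch of how a caller might summarize it, assuming the LlamaCompletionResult type is importable as below (the import path is an assumption; the fields match the type added in binding.ts above):

import type { LlamaCompletionResult } from '@fugood/llama.node/lib/binding' // assumed import path

// Summarize prompt processing vs. token generation speed from a completion result.
function formatTimings(result: LlamaCompletionResult): string {
  const t = result.timings
  return (
    `prompt: ${t.prompt_n} tokens in ${t.prompt_ms.toFixed(1)} ms (${t.prompt_per_second.toFixed(1)} tok/s)\n` +
    `predict: ${t.predicted_n} tokens in ${t.predicted_ms.toFixed(1)} ms (${t.predicted_per_second.toFixed(1)} tok/s)`
  )
}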
package/src/LlamaContext.cpp
CHANGED
@@ -1,3 +1,4 @@
+#include "ggml.h"
 #include "LlamaContext.h"
 #include "DetokenizeWorker.h"
 #include "DisposeWorker.h"

@@ -60,6 +61,36 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
   exports.Set("LlamaContext", func);
 }
 
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+  for (const auto & type : kv_cache_types) {
+    if (ggml_type_name(type) == s) {
+      return type;
+    }
+  }
+  throw std::runtime_error("Unsupported cache type: " + s);
+}
+
+static int32_t pooling_type_from_str(const std::string & s) {
+  if (s == "none") return LLAMA_POOLING_TYPE_NONE;
+  if (s == "mean") return LLAMA_POOLING_TYPE_MEAN;
+  if (s == "cls") return LLAMA_POOLING_TYPE_CLS;
+  if (s == "last") return LLAMA_POOLING_TYPE_LAST;
+  if (s == "rank") return LLAMA_POOLING_TYPE_RANK;
+  return LLAMA_POOLING_TYPE_UNSPECIFIED;
+}
+
 // construct({ model, embedding, n_ctx, n_batch, n_threads, n_gpu_layers,
 // use_mlock, use_mmap }): LlamaContext throws error
 LlamaContext::LlamaContext(const Napi::CallbackInfo &info)

@@ -76,20 +107,31 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
     Napi::TypeError::New(env, "Model is required").ThrowAsJavaScriptException();
   }
 
+  params.vocab_only = get_option<bool>(options, "vocab_only", false);
+  if (params.vocab_only) {
+    params.warmup = false;
+  }
+
   params.n_ctx = get_option<int32_t>(options, "n_ctx", 512);
   params.n_batch = get_option<int32_t>(options, "n_batch", 2048);
+  params.n_ubatch = get_option<int32_t>(options, "n_ubatch", 512);
   params.embedding = get_option<bool>(options, "embedding", false);
   if (params.embedding) {
     // For non-causal models, batch size must be equal to ubatch size
     params.n_ubatch = params.n_batch;
   }
   params.embd_normalize = get_option<int32_t>(options, "embd_normalize", 2);
-
-
+  params.pooling_type = (enum llama_pooling_type) pooling_type_from_str(
+      get_option<std::string>(options, "pooling_type", "").c_str()
+  );
 
   params.cpuparams.n_threads =
       get_option<int32_t>(options, "n_threads", cpu_get_num_math() / 2);
   params.n_gpu_layers = get_option<int32_t>(options, "n_gpu_layers", -1);
+  params.flash_attn = get_option<bool>(options, "flash_attn", false);
+  params.cache_type_k = kv_cache_type_from_str(get_option<std::string>(options, "cache_type_k", "f16").c_str());
+  params.cache_type_v = kv_cache_type_from_str(get_option<std::string>(options, "cache_type_v", "f16").c_str());
+
   params.use_mlock = get_option<bool>(options, "use_mlock", false);
   params.use_mmap = get_option<bool>(options, "use_mmap", true);
   params.numa =
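
kv_cache_type_from_str matches the option string against ggml_type_name and throws for any name it does not recognize, so an invalid cache_type_k / cache_type_v now fails at context construction. A hedged TypeScript sketch of equivalent caller-side validation (note the native table also includes GGML_TYPE_BF16, i.e. 'bf16', which the binding.ts union above does not list):

// Cache-type names declared in the binding.ts union.
const KV_CACHE_TYPES = ['f16', 'f32', 'q8_0', 'q4_0', 'q4_1', 'iq4_nl', 'q5_0', 'q5_1'] as const
type KvCacheType = (typeof KV_CACHE_TYPES)[number]

// Fails with the same wording the native helper uses for unknown names.
function assertKvCacheType(s: string): asserts s is KvCacheType {
  if (!(KV_CACHE_TYPES as readonly string[]).includes(s)) {
    throw new Error(`Unsupported cache type: ${s}`)
  }
}

// Usage: validate user input before passing it as cache_type_k / cache_type_v.
assertKvCacheType('q8_0') // ok
// assertKvCacheType('q2_k') // throws: Unsupported cache type: q2_k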