@fugood/llama.node 0.3.4 → 0.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CMakeLists.txt CHANGED
@@ -7,7 +7,7 @@ project (llama-node)
  set(CMAKE_CXX_STANDARD 17)

  execute_process(COMMAND
- git apply ${CMAKE_CURRENT_SOURCE_DIR}/scripts/ggml-cpu-CMakeLists.txt.patch
+ git apply ${CMAKE_CURRENT_SOURCE_DIR}/scripts/llama.cpp.patch
  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
  )

Binary files CHANGED (10 files; contents and paths not shown in this diff view)
package/lib/binding.ts CHANGED
@@ -9,13 +9,34 @@ export type LlamaModelOptions = {
  model: string
  embedding?: boolean
  embd_normalize?: number
- pooling_type?: number
+ pooling_type?: 'none' | 'mean' | 'cls' | 'last' | 'rank'
  n_ctx?: number
  n_batch?: number
+ n_ubatch?: number
  n_threads?: number
  n_gpu_layers?: number
+ flash_attn?: boolean
+ cache_type_k?:
+   | 'f16'
+   | 'f32'
+   | 'q8_0'
+   | 'q4_0'
+   | 'q4_1'
+   | 'iq4_nl'
+   | 'q5_0'
+   | 'q5_1'
+ cache_type_v?:
+   | 'f16'
+   | 'f32'
+   | 'q8_0'
+   | 'q4_0'
+   | 'q4_1'
+   | 'iq4_nl'
+   | 'q5_0'
+   | 'q5_1'
  use_mlock?: boolean
  use_mmap?: boolean
+ vocab_only?: boolean
  }

  export type LlamaCompletionOptions = {
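
The options added or changed above (pooling_type as a string union, n_ubatch, flash_attn, cache_type_k/v, vocab_only) are passed straight through to the native context. A minimal sketch of using them; loadModel as the package entry point and the model path are assumptions, only the option names and values come from the LlamaModelOptions typing above:

import { loadModel } from '@fugood/llama.node'

async function createContext(modelPath: string) {
  // loadModel is assumed to be the package's entry point; the option names
  // and string values mirror the LlamaModelOptions typing above.
  return loadModel({
    model: modelPath,
    n_ctx: 4096,
    n_batch: 2048,
    n_ubatch: 512,        // new: micro-batch size
    n_gpu_layers: 99,
    flash_attn: true,     // new: enable flash attention
    cache_type_k: 'q8_0', // new: quantized KV-cache keys
    cache_type_v: 'q8_0', // new: quantized KV-cache values
  })
}
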
@@ -53,6 +74,16 @@ export type LlamaCompletionResult = {
  tokens_predicted: number
  tokens_evaluated: number
  truncated: boolean
+ timings: {
+   prompt_n: number
+   prompt_ms: number
+   prompt_per_token_ms: number
+   prompt_per_second: number
+   predicted_n: number
+   predicted_ms: number
+   predicted_per_token_ms: number
+   predicted_per_second: number
+ }
  }

  export type LlamaCompletionToken = {
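
Completion results now carry a timings object. A sketch of reading it; the context comes from loadModel as in the earlier sketch, and completion() with these option names is an assumption, while the timings shape is the LlamaCompletionResult one defined above:

// `LlamaContextLike` is a hypothetical minimal shape used only for this sketch.
type LlamaContextLike = {
  completion(opts: { prompt: string; n_predict?: number }): Promise<{ timings: Record<string, number> }>
}

async function logTimings(context: LlamaContextLike) {
  const { timings: t } = await context.completion({ prompt: 'Hello', n_predict: 64 })
  console.log(`prompt: ${t.prompt_n} tokens in ${t.prompt_ms.toFixed(1)} ms (${t.prompt_per_second.toFixed(1)} tok/s)`)
  console.log(`predict: ${t.predicted_n} tokens in ${t.predicted_ms.toFixed(1)} ms (${t.predicted_per_second.toFixed(1)} tok/s)`)
}
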
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "0.3.4",
+ "version": "0.3.6",
  "description": "Llama.cpp for Node.js",
  "main": "lib/index.js",
  "scripts": {
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -159,6 +159,22 @@ void LlamaCompletionWorker::OnOK() {
  Napi::Boolean::New(Napi::AsyncWorker::Env(), _result.truncated));
  result.Set("text",
  Napi::String::New(Napi::AsyncWorker::Env(), _result.text.c_str()));
+
+ auto ctx = _sess->context();
+ const auto timings_token = llama_perf_context(ctx);
+
+ auto timingsResult = Napi::Object::New(Napi::AsyncWorker::Env());
+ timingsResult.Set("prompt_n", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.n_p_eval));
+ timingsResult.Set("prompt_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_p_eval_ms));
+ timingsResult.Set("prompt_per_token_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_p_eval_ms / timings_token.n_p_eval));
+ timingsResult.Set("prompt_per_second", Napi::Number::New(Napi::AsyncWorker::Env(), 1e3 / timings_token.t_p_eval_ms * timings_token.n_p_eval));
+ timingsResult.Set("predicted_n", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.n_eval));
+ timingsResult.Set("predicted_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_eval_ms));
+ timingsResult.Set("predicted_per_token_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_eval_ms / timings_token.n_eval));
+ timingsResult.Set("predicted_per_second", Napi::Number::New(Napi::AsyncWorker::Env(), 1e3 / timings_token.t_eval_ms * timings_token.n_eval));
+
+ result.Set("timings", timingsResult);
+
  Napi::Promise::Deferred::Resolve(result);
  }

package/src/LlamaContext.cpp CHANGED
@@ -1,3 +1,4 @@
+ #include "ggml.h"
  #include "LlamaContext.h"
  #include "DetokenizeWorker.h"
  #include "DisposeWorker.h"
@@ -60,6 +61,36 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
  exports.Set("LlamaContext", func);
  }

+ const std::vector<ggml_type> kv_cache_types = {
+   GGML_TYPE_F32,
+   GGML_TYPE_F16,
+   GGML_TYPE_BF16,
+   GGML_TYPE_Q8_0,
+   GGML_TYPE_Q4_0,
+   GGML_TYPE_Q4_1,
+   GGML_TYPE_IQ4_NL,
+   GGML_TYPE_Q5_0,
+   GGML_TYPE_Q5_1,
+ };
+
+ static ggml_type kv_cache_type_from_str(const std::string & s) {
+   for (const auto & type : kv_cache_types) {
+     if (ggml_type_name(type) == s) {
+       return type;
+     }
+   }
+   throw std::runtime_error("Unsupported cache type: " + s);
+ }
+
+ static int32_t pooling_type_from_str(const std::string & s) {
+   if (s == "none") return LLAMA_POOLING_TYPE_NONE;
+   if (s == "mean") return LLAMA_POOLING_TYPE_MEAN;
+   if (s == "cls") return LLAMA_POOLING_TYPE_CLS;
+   if (s == "last") return LLAMA_POOLING_TYPE_LAST;
+   if (s == "rank") return LLAMA_POOLING_TYPE_RANK;
+   return LLAMA_POOLING_TYPE_UNSPECIFIED;
+ }
+
  // construct({ model, embedding, n_ctx, n_batch, n_threads, n_gpu_layers,
  // use_mlock, use_mmap }): LlamaContext throws error
  LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
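
These helpers back the string-typed options in binding.ts: cache_type_k/v strings are matched against ggml type names and an unsupported value throws, while an unrecognized pooling_type string falls back to LLAMA_POOLING_TYPE_UNSPECIFIED. An embedding-oriented sketch under the same loadModel assumption; context.embedding() is also an assumption about the API surface:

import { loadModel } from '@fugood/llama.node'

async function embed(modelPath: string, text: string) {
  const context = await loadModel({
    model: modelPath,
    embedding: true,      // embedding mode; n_ubatch is forced to n_batch (see the constructor below)
    embd_normalize: 2,    // Euclidean (L2) normalization, the constructor's default
    pooling_type: 'mean', // string value resolved by pooling_type_from_str
  })
  // context.embedding() is an assumption about the public API.
  return context.embedding(text)
}
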
@@ -76,20 +107,31 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  Napi::TypeError::New(env, "Model is required").ThrowAsJavaScriptException();
  }

+ params.vocab_only = get_option<bool>(options, "vocab_only", false);
+ if (params.vocab_only) {
+   params.warmup = false;
+ }
+
  params.n_ctx = get_option<int32_t>(options, "n_ctx", 512);
  params.n_batch = get_option<int32_t>(options, "n_batch", 2048);
+ params.n_ubatch = get_option<int32_t>(options, "n_ubatch", 512);
  params.embedding = get_option<bool>(options, "embedding", false);
  if (params.embedding) {
    // For non-causal models, batch size must be equal to ubatch size
    params.n_ubatch = params.n_batch;
  }
  params.embd_normalize = get_option<int32_t>(options, "embd_normalize", 2);
- int32_t pooling_type = get_option<int32_t>(options, "pooling_type", -1);
- params.pooling_type = (enum llama_pooling_type) pooling_type;
+ params.pooling_type = (enum llama_pooling_type) pooling_type_from_str(
+   get_option<std::string>(options, "pooling_type", "").c_str()
+ );

  params.cpuparams.n_threads =
    get_option<int32_t>(options, "n_threads", cpu_get_num_math() / 2);
  params.n_gpu_layers = get_option<int32_t>(options, "n_gpu_layers", -1);
+ params.flash_attn = get_option<bool>(options, "flash_attn", false);
+ params.cache_type_k = kv_cache_type_from_str(get_option<std::string>(options, "cache_type_k", "f16").c_str());
+ params.cache_type_v = kv_cache_type_from_str(get_option<std::string>(options, "cache_type_v", "f16").c_str());
+
  params.use_mlock = get_option<bool>(options, "use_mlock", false);
  params.use_mmap = get_option<bool>(options, "use_mmap", true);
  params.numa =
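
vocab_only loads only the vocabulary and, as shown above, disables warmup, which is enough for tokenization-only work without paying for a full weight load. A sketch under the same assumptions about the exported API; tokenize() and its result shape are assumptions:

import { loadModel } from '@fugood/llama.node'

async function countTokens(modelPath: string, text: string) {
  // A vocab_only context skips loading the full model weights.
  const context = await loadModel({ model: modelPath, vocab_only: true })
  // tokenize() returning { tokens } is an assumption about the public API.
  const { tokens } = await context.tokenize(text)
  return tokens.length
}
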