@fugood/llama.node 0.3.5 → 0.3.6
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +21 -1
- package/package.json +1 -1
- package/src/LlamaContext.cpp +39 -2
package/bin/* (all prebuilt llama-node.node and node.lib binaries listed above)
CHANGED
Binary files rebuilt for every platform/arch; no textual diff available.
package/lib/binding.ts
CHANGED
@@ -9,11 +9,31 @@ export type LlamaModelOptions = {
   model: string
   embedding?: boolean
   embd_normalize?: number
-  pooling_type?:
+  pooling_type?: 'none' | 'mean' | 'cls' | 'last' | 'rank'
   n_ctx?: number
   n_batch?: number
+  n_ubatch?: number
   n_threads?: number
   n_gpu_layers?: number
+  flash_attn?: boolean
+  cache_type_k?:
+    | 'f16'
+    | 'f32'
+    | 'q8_0'
+    | 'q4_0'
+    | 'q4_1'
+    | 'iq4_nl'
+    | 'q5_0'
+    | 'q5_1'
+  cache_type_v?:
+    | 'f16'
+    | 'f32'
+    | 'q8_0'
+    | 'q4_0'
+    | 'q4_1'
+    | 'iq4_nl'
+    | 'q5_0'
+    | 'q5_1'
   use_mlock?: boolean
   use_mmap?: boolean
   vocab_only?: boolean
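For reference, a minimal TypeScript sketch of how the options added in 0.3.6 might be supplied. The import path and the model filename are assumptions for illustration; only the LlamaModelOptions fields themselves come from the diff above, and the object is ultimately handed to the package's model-loading entry point (not shown in this diff).

// Sketch only: import path and model path are assumptions, not part of the diff.
import type { LlamaModelOptions } from '@fugood/llama.node'

const options: LlamaModelOptions = {
  model: './models/example-q4_0.gguf', // hypothetical path
  embedding: true,
  pooling_type: 'mean',   // new in 0.3.6: typed string union
  n_ctx: 4096,
  n_batch: 2048,
  n_ubatch: 512,          // new in 0.3.6: micro-batch size
  flash_attn: true,       // new in 0.3.6
  cache_type_k: 'q8_0',   // new in 0.3.6: quantized K cache
  cache_type_v: 'q8_0',   // new in 0.3.6: quantized V cache
}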
package/package.json
CHANGED (1 line: version bumped from 0.3.5 to 0.3.6)
package/src/LlamaContext.cpp
CHANGED
@@ -1,3 +1,4 @@
+#include "ggml.h"
 #include "LlamaContext.h"
 #include "DetokenizeWorker.h"
 #include "DisposeWorker.h"
@@ -60,6 +61,36 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
   exports.Set("LlamaContext", func);
 }
 
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+  for (const auto & type : kv_cache_types) {
+    if (ggml_type_name(type) == s) {
+      return type;
+    }
+  }
+  throw std::runtime_error("Unsupported cache type: " + s);
+}
+
+static int32_t pooling_type_from_str(const std::string & s) {
+  if (s == "none") return LLAMA_POOLING_TYPE_NONE;
+  if (s == "mean") return LLAMA_POOLING_TYPE_MEAN;
+  if (s == "cls") return LLAMA_POOLING_TYPE_CLS;
+  if (s == "last") return LLAMA_POOLING_TYPE_LAST;
+  if (s == "rank") return LLAMA_POOLING_TYPE_RANK;
+  return LLAMA_POOLING_TYPE_UNSPECIFIED;
+}
+
 // construct({ model, embedding, n_ctx, n_batch, n_threads, n_gpu_layers,
 // use_mlock, use_mmap }): LlamaContext throws error
 LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
@@ -83,18 +114,24 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
 
   params.n_ctx = get_option<int32_t>(options, "n_ctx", 512);
   params.n_batch = get_option<int32_t>(options, "n_batch", 2048);
+  params.n_ubatch = get_option<int32_t>(options, "n_ubatch", 512);
   params.embedding = get_option<bool>(options, "embedding", false);
   if (params.embedding) {
     // For non-causal models, batch size must be equal to ubatch size
     params.n_ubatch = params.n_batch;
   }
   params.embd_normalize = get_option<int32_t>(options, "embd_normalize", 2);
-
-
+  params.pooling_type = (enum llama_pooling_type) pooling_type_from_str(
+    get_option<std::string>(options, "pooling_type", "").c_str()
+  );
 
   params.cpuparams.n_threads =
       get_option<int32_t>(options, "n_threads", cpu_get_num_math() / 2);
   params.n_gpu_layers = get_option<int32_t>(options, "n_gpu_layers", -1);
+  params.flash_attn = get_option<bool>(options, "flash_attn", false);
+  params.cache_type_k = kv_cache_type_from_str(get_option<std::string>(options, "cache_type_k", "f16").c_str());
+  params.cache_type_v = kv_cache_type_from_str(get_option<std::string>(options, "cache_type_v", "f16").c_str());
+
   params.use_mlock = get_option<bool>(options, "use_mlock", false);
   params.use_mmap = get_option<bool>(options, "use_mmap", true);
   params.numa =