@fugood/llama.node 0.3.5 → 0.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
10 binary files changed (binary contents not shown in this diff)
package/lib/binding.ts CHANGED
@@ -9,11 +9,31 @@ export type LlamaModelOptions = {
   model: string
   embedding?: boolean
   embd_normalize?: number
-  pooling_type?: number
+  pooling_type?: 'none' | 'mean' | 'cls' | 'last' | 'rank'
   n_ctx?: number
   n_batch?: number
+  n_ubatch?: number
   n_threads?: number
   n_gpu_layers?: number
+  flash_attn?: boolean
+  cache_type_k?:
+    | 'f16'
+    | 'f32'
+    | 'q8_0'
+    | 'q4_0'
+    | 'q4_1'
+    | 'iq4_nl'
+    | 'q5_0'
+    | 'q5_1'
+  cache_type_v?:
+    | 'f16'
+    | 'f32'
+    | 'q8_0'
+    | 'q4_0'
+    | 'q4_1'
+    | 'iq4_nl'
+    | 'q5_0'
+    | 'q5_1'
   use_mlock?: boolean
   use_mmap?: boolean
   vocab_only?: boolean
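The binding.ts change above replaces the old numeric `pooling_type` with a string union and adds `n_ubatch`, `flash_attn`, and the KV-cache type options. A minimal sketch of an options object using the new fields; the import path is an assumption for illustration, only the field names and value types come from the diff:

```ts
// Assumed import path; LlamaModelOptions is declared in lib/binding.ts
import type { LlamaModelOptions } from '@fugood/llama.node/lib/binding'

const options: LlamaModelOptions = {
  model: '/path/to/model.gguf',
  n_ctx: 4096,
  n_batch: 2048,
  n_ubatch: 512,         // new in 0.3.6
  flash_attn: true,      // new in 0.3.6
  cache_type_k: 'q8_0',  // new in 0.3.6: quantized K cache
  cache_type_v: 'q8_0',  // new in 0.3.6: quantized V cache
  pooling_type: 'mean',  // now a string union instead of a number
}
```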
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "0.3.5",
+  "version": "0.3.6",
   "description": "Llama.cpp for Node.js",
   "main": "lib/index.js",
   "scripts": {
@@ -1,3 +1,4 @@
+#include "ggml.h"
 #include "LlamaContext.h"
 #include "DetokenizeWorker.h"
 #include "DisposeWorker.h"
@@ -60,6 +61,36 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
   exports.Set("LlamaContext", func);
 }
 
+const std::vector<ggml_type> kv_cache_types = {
+  GGML_TYPE_F32,
+  GGML_TYPE_F16,
+  GGML_TYPE_BF16,
+  GGML_TYPE_Q8_0,
+  GGML_TYPE_Q4_0,
+  GGML_TYPE_Q4_1,
+  GGML_TYPE_IQ4_NL,
+  GGML_TYPE_Q5_0,
+  GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+  for (const auto & type : kv_cache_types) {
+    if (ggml_type_name(type) == s) {
+      return type;
+    }
+  }
+  throw std::runtime_error("Unsupported cache type: " + s);
+}
+
+static int32_t pooling_type_from_str(const std::string & s) {
+  if (s == "none") return LLAMA_POOLING_TYPE_NONE;
+  if (s == "mean") return LLAMA_POOLING_TYPE_MEAN;
+  if (s == "cls") return LLAMA_POOLING_TYPE_CLS;
+  if (s == "last") return LLAMA_POOLING_TYPE_LAST;
+  if (s == "rank") return LLAMA_POOLING_TYPE_RANK;
+  return LLAMA_POOLING_TYPE_UNSPECIFIED;
+}
+
 // construct({ model, embedding, n_ctx, n_batch, n_threads, n_gpu_layers,
 //             use_mlock, use_mmap }): LlamaContext throws error
 LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
@@ -83,18 +114,24 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
 
   params.n_ctx = get_option<int32_t>(options, "n_ctx", 512);
   params.n_batch = get_option<int32_t>(options, "n_batch", 2048);
+  params.n_ubatch = get_option<int32_t>(options, "n_ubatch", 512);
   params.embedding = get_option<bool>(options, "embedding", false);
   if (params.embedding) {
     // For non-causal models, batch size must be equal to ubatch size
     params.n_ubatch = params.n_batch;
   }
   params.embd_normalize = get_option<int32_t>(options, "embd_normalize", 2);
-  int32_t pooling_type = get_option<int32_t>(options, "pooling_type", -1);
-  params.pooling_type = (enum llama_pooling_type) pooling_type;
+  params.pooling_type = (enum llama_pooling_type) pooling_type_from_str(
+    get_option<std::string>(options, "pooling_type", "").c_str()
+  );
 
   params.cpuparams.n_threads =
       get_option<int32_t>(options, "n_threads", cpu_get_num_math() / 2);
   params.n_gpu_layers = get_option<int32_t>(options, "n_gpu_layers", -1);
+  params.flash_attn = get_option<bool>(options, "flash_attn", false);
+  params.cache_type_k = kv_cache_type_from_str(get_option<std::string>(options, "cache_type_k", "f16").c_str());
+  params.cache_type_v = kv_cache_type_from_str(get_option<std::string>(options, "cache_type_v", "f16").c_str());
+
   params.use_mlock = get_option<bool>(options, "use_mlock", false);
   params.use_mmap = get_option<bool>(options, "use_mmap", true);
   params.numa =
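Putting the constructor changes together: `cache_type_k` / `cache_type_v` default to "f16" and any string outside the list above makes construction fail with "Unsupported cache type: ...", while an empty or unrecognized `pooling_type` falls back to LLAMA_POOLING_TYPE_UNSPECIFIED. A hedged usage sketch from the JavaScript side; `loadModel` is assumed to be the package's entry point and is not part of this diff, only the option names and defaults come from the code above:

```ts
import { loadModel } from '@fugood/llama.node' // assumed entry point, not shown in this diff

async function main() {
  const context = await loadModel({
    model: '/path/to/model.gguf',
    n_gpu_layers: 99,
    flash_attn: true,      // in llama.cpp, a quantized V cache generally requires flash attention
    cache_type_k: 'q8_0',  // defaults to 'f16'; an unsupported string throws on construction
    cache_type_v: 'q8_0',
  })
  // ... use the context as before; the remaining options are unchanged from 0.3.5
}

main()
```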