@fugood/llama.node 1.0.0-beta.6 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/lib/binding.ts +12 -0
- package/lib/index.js +10 -0
- package/lib/index.ts +17 -1
- package/package.json +14 -14
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +7 -3
- package/src/LlamaCompletionWorker.h +2 -0
- package/src/LlamaContext.cpp +49 -6
- package/src/LlamaContext.h +1 -0
- package/src/RerankWorker.h +26 -0
- package/src/common.hpp +1 -1
- package/src/llama.cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/src/llama.cpp/include/llama.h +6 -3
- package/src/llama.cpp/src/llama-arch.cpp +54 -0
- package/src/llama.cpp/src/llama-arch.h +17 -0
- package/src/llama.cpp/src/llama-batch.cpp +20 -7
- package/src/llama.cpp/src/llama-chat.cpp +11 -6
- package/src/llama.cpp/src/llama-context.cpp +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +19 -4
- package/src/llama.cpp/src/llama-graph.h +14 -2
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
- package/src/llama.cpp/src/llama-kv-cells.h +33 -9
- package/src/llama.cpp/src/llama-model.cpp +518 -1
- package/src/llama.cpp/src/llama-model.h +22 -0
- package/src/llama.cpp/src/llama-quant.cpp +87 -5
package/CMakeLists.txt
CHANGED
package/lib/binding.ts
CHANGED
@@ -79,6 +79,8 @@ export type LlamaCompletionOptions = {
   tools?: object
   parallel_tool_calls?: boolean
   tool_choice?: string
+  enable_thinking?: boolean
+  thinking_forced_open?: boolean
   prompt?: string
   temperature?: number
   top_k?: number
@@ -157,6 +159,15 @@ export type EmbeddingResult = {
   embedding: Float32Array
 }
 
+export type RerankParams = {
+  normalize?: number
+}
+
+export type RerankResult = {
+  score: number
+  index: number
+}
+
 export interface LlamaContext {
   new (options: LlamaModelOptions): LlamaContext
   getSystemInfo(): string
@@ -180,6 +191,7 @@ export interface LlamaContext {
   tokenize(text: string, media_paths?: string[]): Promise<TokenizeResult>
   detokenize(tokens: number[]): Promise<string>
   embedding(text: string): Promise<EmbeddingResult>
+  rerank(query: string, documents: string[], params?: RerankParams): Promise<RerankResult[]>
   saveSession(path: string): Promise<void>
   loadSession(path: string): Promise<void>
   release(): Promise<void>
package/lib/index.js
CHANGED
@@ -131,6 +131,7 @@ class LlamaContextWrapper {
     };
   }
   getFormattedChat(messages, template, params) {
+    var _a;
     const { messages: chat, has_media, media_paths, } = this._formatMediaChat(messages);
     const useJinja = this.isJinjaSupported() && (params === null || params === void 0 ? void 0 : params.jinja);
     let tmpl;
@@ -143,6 +144,7 @@ class LlamaContextWrapper {
       tools: params === null || params === void 0 ? void 0 : params.tools,
       parallel_tool_calls: params === null || params === void 0 ? void 0 : params.parallel_tool_calls,
       tool_choice: params === null || params === void 0 ? void 0 : params.tool_choice,
+      enable_thinking: (_a = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _a !== void 0 ? _a : true,
     });
     if (!useJinja) {
       return {
@@ -174,6 +176,14 @@ class LlamaContextWrapper {
   embedding(text) {
     return this.ctx.embedding(text);
   }
+  rerank(query, documents, params) {
+    return this.ctx.rerank(query, documents, params).then((results) => {
+      // Sort by score descending and add document text for convenience
+      return results
+        .map((result) => (Object.assign(Object.assign({}, result), { document: documents[result.index] })))
+        .sort((a, b) => b.score - a.score);
+    });
+  }
   saveSession(path) {
     return this.ctx.saveSession(path);
   }
package/lib/index.ts
CHANGED
@@ -9,6 +9,8 @@ import type {
   LlamaCompletionResult,
   TokenizeResult,
   EmbeddingResult,
+  RerankParams,
+  RerankResult,
   CompletionResponseFormat,
 } from './binding'
 
@@ -158,7 +160,8 @@ class LlamaContextWrapper {
       response_format?: CompletionResponseFormat
       tools?: object
       parallel_tool_calls?: object
-      tool_choice?: string
+      tool_choice?: string,
+      enable_thinking?: boolean,
     },
   ): FormattedChatResult {
     const {
@@ -178,6 +181,7 @@ class LlamaContextWrapper {
       tools: params?.tools,
       parallel_tool_calls: params?.parallel_tool_calls,
       tool_choice: params?.tool_choice,
+      enable_thinking: params?.enable_thinking ?? true,
     })
 
     if (!useJinja) {
@@ -224,6 +228,18 @@ class LlamaContextWrapper {
     return this.ctx.embedding(text)
   }
 
+  rerank(query: string, documents: string[], params?: RerankParams): Promise<Array<RerankResult & { document: string }>> {
+    return this.ctx.rerank(query, documents, params).then((results: RerankResult[]) => {
+      // Sort by score descending and add document text for convenience
+      return results
+        .map((result: RerankResult) => ({
+          ...result,
+          document: documents[result.index],
+        }))
+        .sort((a: RerankResult & { document: string }, b: RerankResult & { document: string }) => b.score - a.score)
+    })
+  }
+
   saveSession(path: string): Promise<void> {
     return this.ctx.saveSession(path)
   }
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.0.0-beta.6",
+  "version": "1.0.0",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -70,19 +70,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.0.0-beta.6",
-    "@fugood/node-llama-linux-x64-vulkan": "1.0.0-beta.6",
-    "@fugood/node-llama-linux-x64-cuda": "1.0.0-beta.6",
-    "@fugood/node-llama-linux-arm64": "1.0.0-beta.6",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.0.0-beta.6",
-    "@fugood/node-llama-linux-arm64-cuda": "1.0.0-beta.6",
-    "@fugood/node-llama-win32-x64": "1.0.0-beta.6",
-    "@fugood/node-llama-win32-x64-vulkan": "1.0.0-beta.6",
-    "@fugood/node-llama-win32-x64-cuda": "1.0.0-beta.6",
-    "@fugood/node-llama-win32-arm64": "1.0.0-beta.6",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.0.0-beta.6",
-    "@fugood/node-llama-darwin-x64": "1.0.0-beta.6",
-    "@fugood/node-llama-darwin-arm64": "1.0.0-beta.6"
+    "@fugood/node-llama-linux-x64": "1.0.0",
+    "@fugood/node-llama-linux-x64-vulkan": "1.0.0",
+    "@fugood/node-llama-linux-x64-cuda": "1.0.0",
+    "@fugood/node-llama-linux-arm64": "1.0.0",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.0.0",
+    "@fugood/node-llama-linux-arm64-cuda": "1.0.0",
+    "@fugood/node-llama-win32-x64": "1.0.0",
+    "@fugood/node-llama-win32-x64-vulkan": "1.0.0",
+    "@fugood/node-llama-win32-x64-cuda": "1.0.0",
+    "@fugood/node-llama-win32-arm64": "1.0.0",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.0.0",
+    "@fugood/node-llama-darwin-x64": "1.0.0",
+    "@fugood/node-llama-darwin-arm64": "1.0.0"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/EmbeddingWorker.cpp
CHANGED
@@ -8,7 +8,7 @@ EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info,
       _params(params) {}
 
 void EmbeddingWorker::Execute() {
-
+  llama_memory_clear(llama_get_memory(_sess->context()), true);
   auto tokens = ::common_tokenize(_sess->context(), _text, true);
   // add SEP if not present
   auto vocab = llama_model_get_vocab(_sess->model());
package/src/LlamaCompletionWorker.cpp
CHANGED
@@ -29,11 +29,13 @@ LlamaCompletionWorker::LlamaCompletionWorker(
     common_params params,
     std::vector<std::string> stop_words,
     int32_t chat_format,
+    bool thinking_forced_open,
     std::string reasoning_format,
     const std::vector<std::string> &media_paths,
     const std::vector<llama_token> &guide_tokens)
     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
       _params(params), _stop_words(stop_words), _chat_format(chat_format),
+      _thinking_forced_open(thinking_forced_open),
       _reasoning_format(reasoning_format),
       _media_paths(media_paths), _guide_tokens(guide_tokens) {
   if (!callback.IsEmpty()) {
@@ -113,7 +115,7 @@ void LlamaCompletionWorker::Execute() {
       --n_cur;
     }
     n_input -= n_cur;
-
+    llama_memory_seq_rm(llama_get_memory(ctx), 0, n_cur, -1);
   }
   // Set the tokens
   _sess->set_tokens(std::move(prompt_tokens));
@@ -135,8 +137,9 @@ void LlamaCompletionWorker::Execute() {
       const int n_left = n_cur - n_keep - 1;
       const int n_discard = n_left / 2;
 
-
-
+      auto mem = llama_get_memory(ctx);
+      llama_memory_seq_rm(mem, 0, n_keep + 1, n_keep + n_discard + 1);
+      llama_memory_seq_add(mem, 0, n_keep + 1 + n_discard, n_cur, -n_discard);
 
       // shift the tokens
       embd->insert(embd->begin() + n_keep + 1,
@@ -240,6 +243,7 @@ void LlamaCompletionWorker::OnOK() {
   try {
     common_chat_syntax chat_syntax;
     chat_syntax.format = static_cast<common_chat_format>(_chat_format);
+    chat_syntax.thinking_forced_open = _thinking_forced_open;
 
     if (_reasoning_format == "deepseek") {
       chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
package/src/LlamaCompletionWorker.h
CHANGED
@@ -20,6 +20,7 @@ public:
                         Napi::Function callback, common_params params,
                         std::vector<std::string> stop_words,
                         int32_t chat_format,
+                        bool thinking_forced_open,
                         std::string reasoning_format,
                         const std::vector<std::string> &media_paths = {},
                         const std::vector<llama_token> &guide_tokens = {});
@@ -42,6 +43,7 @@ private:
   common_params _params;
   std::vector<std::string> _stop_words;
   int32_t _chat_format;
+  bool _thinking_forced_open;
   std::string _reasoning_format;
   std::vector<std::string> _media_paths;
   std::vector<llama_token> _guide_tokens;
package/src/LlamaContext.cpp
CHANGED
@@ -3,6 +3,7 @@
 #include "DetokenizeWorker.h"
 #include "DisposeWorker.h"
 #include "EmbeddingWorker.h"
+#include "RerankWorker.h"
 #include "LlamaCompletionWorker.h"
 #include "LoadSessionWorker.h"
 #include "SaveSessionWorker.h"
@@ -110,6 +111,8 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
           static_cast<napi_property_attributes>(napi_enumerable)),
       InstanceMethod<&LlamaContext::Embedding>(
           "embedding", static_cast<napi_property_attributes>(napi_enumerable)),
+      InstanceMethod<&LlamaContext::Rerank>(
+          "rerank", static_cast<napi_property_attributes>(napi_enumerable)),
       InstanceMethod<&LlamaContext::SaveSession>(
          "saveSession",
          static_cast<napi_property_attributes>(napi_enumerable)),
@@ -499,7 +502,9 @@ common_chat_params getFormattedChatWithJinja(
     const common_chat_templates_ptr &templates, const std::string &messages,
     const std::string &chat_template, const std::string &json_schema,
     const std::string &tools, const bool &parallel_tool_calls,
-    const std::string &tool_choice
+    const std::string &tool_choice,
+    const bool &enable_thinking
+) {
   common_chat_templates_inputs inputs;
   inputs.messages = common_chat_msgs_parse_oaicompat(json::parse(messages));
   auto useTools = !tools.empty();
@@ -513,6 +518,7 @@ common_chat_params getFormattedChatWithJinja(
   if (!json_schema.empty()) {
     inputs.json_schema = json::parse(json_schema);
   }
+  inputs.enable_thinking = enable_thinking;
 
   // If chat_template is provided, create new one and use it (probably slow)
   if (!chat_template.empty()) {
@@ -586,12 +592,11 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
   auto parallel_tool_calls =
       get_option<bool>(params, "parallel_tool_calls", false);
   auto tool_choice = get_option<std::string>(params, "tool_choice", "");
+  auto enable_thinking = get_option<bool>(params, "enable_thinking", false);
 
   auto chatParams = getFormattedChatWithJinja(
       _sess, _templates, messages, chat_template, json_schema_str, tools_str,
-      parallel_tool_calls, tool_choice);
-
-  console_log(env, std::string("format: ") + std::to_string(chatParams.format));
+      parallel_tool_calls, tool_choice, enable_thinking);
 
   Napi::Object result = Napi::Object::New(env);
   result.Set("prompt", chatParams.prompt);
@@ -612,6 +617,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
     grammar_triggers.Set(i, triggerObj);
   }
   result.Set("grammar_triggers", grammar_triggers);
+  result.Set("thinking_forced_open", chatParams.thinking_forced_open);
   // preserved_tokens: string[]
   Napi::Array preserved_tokens = Napi::Array::New(env);
   for (size_t i = 0; i < chatParams.preserved_tokens.size(); i++) {
@@ -685,6 +691,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   }
 
   int32_t chat_format = get_option<int32_t>(options, "chat_format", 0);
+  bool thinking_forced_open = get_option<bool>(options, "thinking_forced_open", false);
   std::string reasoning_format = get_option<std::string>(options, "reasoning_format", "none");
 
   common_params params = _sess->params();
@@ -793,14 +800,16 @@
         get_option<bool>(options, "parallel_tool_calls", false);
     auto tool_choice =
         get_option<std::string>(options, "tool_choice", "none");
+    auto enable_thinking = get_option<bool>(options, "enable_thinking", true);
 
     auto chatParams = getFormattedChatWithJinja(
         _sess, _templates, json_stringify(messages), chat_template,
-        json_schema_str, tools_str, parallel_tool_calls, tool_choice);
+        json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking);
 
     params.prompt = chatParams.prompt;
 
     chat_format = chatParams.format;
+    thinking_forced_open = chatParams.thinking_forced_open;
 
     for (const auto &token : chatParams.preserved_tokens) {
       auto ids =
@@ -895,7 +904,7 @@
 
   auto *worker =
       new LlamaCompletionWorker(info, _sess, callback, params, stop_words,
-                                chat_format, reasoning_format, media_paths, guide_tokens);
+                                chat_format, thinking_forced_open, reasoning_format, media_paths, guide_tokens);
   worker->Queue();
   _wip = worker;
   worker->OnComplete([this]() { _wip = nullptr; });
@@ -976,6 +985,40 @@ Napi::Value LlamaContext::Embedding(const Napi::CallbackInfo &info) {
   return worker->Promise();
 }
 
+// rerank(query: string, documents: string[], params?: object): Promise<RerankResult[]>
+Napi::Value LlamaContext::Rerank(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() < 2 || !info[0].IsString() || !info[1].IsArray()) {
+    Napi::TypeError::New(env, "Query string and documents array expected").ThrowAsJavaScriptException();
+  }
+  if (_sess == nullptr) {
+    Napi::TypeError::New(env, "Context is disposed")
+        .ThrowAsJavaScriptException();
+  }
+
+  auto query = info[0].ToString().Utf8Value();
+  auto documents_array = info[1].As<Napi::Array>();
+
+  // Convert documents array to vector
+  std::vector<std::string> documents;
+  for (size_t i = 0; i < documents_array.Length(); i++) {
+    documents.push_back(documents_array.Get(i).ToString().Utf8Value());
+  }
+
+  auto options = Napi::Object::New(env);
+  if (info.Length() >= 3 && info[2].IsObject()) {
+    options = info[2].As<Napi::Object>();
+  }
+
+  common_params rerankParams;
+  rerankParams.embedding = true;
+  rerankParams.embd_normalize = get_option<int32_t>(options, "normalize", -1);
+
+  auto *worker = new RerankWorker(info, _sess, query, documents, rerankParams);
+  worker->Queue();
+  return worker->Promise();
+}
+
 // saveSession(path: string): Promise<void> throws error
 Napi::Value LlamaContext::SaveSession(const Napi::CallbackInfo &info) {
   Napi::Env env = info.Env();
package/src/LlamaContext.h
CHANGED
@@ -28,6 +28,7 @@ private:
   Napi::Value Tokenize(const Napi::CallbackInfo &info);
   Napi::Value Detokenize(const Napi::CallbackInfo &info);
   Napi::Value Embedding(const Napi::CallbackInfo &info);
+  Napi::Value Rerank(const Napi::CallbackInfo &info);
   Napi::Value SaveSession(const Napi::CallbackInfo &info);
   Napi::Value LoadSession(const Napi::CallbackInfo &info);
   void ApplyLoraAdapters(const Napi::CallbackInfo &info);
package/src/RerankWorker.h
ADDED
@@ -0,0 +1,26 @@
+#include "common.hpp"
+#include <vector>
+
+struct RerankResult {
+  std::vector<float> scores;
+};
+
+class RerankWorker : public Napi::AsyncWorker,
+                     public Napi::Promise::Deferred {
+public:
+  RerankWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
+               std::string query, std::vector<std::string> documents,
+               common_params &params);
+
+protected:
+  void Execute();
+  void OnOK();
+  void OnError(const Napi::Error &err);
+
+private:
+  LlamaSessionPtr _sess;
+  std::string _query;
+  std::vector<std::string> _documents;
+  common_params _params;
+  RerankResult _result;
+};
package/src/common.hpp
CHANGED
@@ -461,7 +461,7 @@ processMediaPrompt(llama_context *ctx, mtmd_context *mtmd_ctx,
   }
 
   // Clear all KV cache entries after position n_past
-
+  llama_memory_seq_rm(llama_get_memory(ctx), 0, n_past, -1);
 
   size_t num_chunks = mtmd_input_chunks_size(chunks);
 
package/src/llama.cpp/CMakeLists.txt
CHANGED
@@ -95,7 +95,7 @@ endif()
 if (NOT DEFINED LLAMA_BUILD_COMMIT)
     set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
 endif()
-set(LLAMA_INSTALL_VERSION 0.0.${
+set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
 
 # override ggml options
 set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
package/src/llama.cpp/common/json-schema-to-grammar.cpp
CHANGED
@@ -41,49 +41,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
     return result;
 }
 
-/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
-class string_view {
-    const std::string & _str;
-    const size_t _start;
-    const size_t _end;
-  public:
-    string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
-
-    size_t size() const {
-        return _end - _start;
-    }
-
-    size_t length() const {
-        return size();
-    }
-
-    operator std::string() const {
-        return str();
-    }
-
-    std::string str() const {
-        return _str.substr(_start, _end - _start);
-    }
-
-    string_view substr(size_t pos, size_t len = std::string::npos) const {
-        return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
-    }
-
-    char operator[](size_t pos) const {
-        auto index = _start + pos;
-        if (index >= _end) {
-            throw std::out_of_range("string_view index out of range");
-        }
-        return _str[_start + pos];
-    }
-
-    bool operator==(const string_view & other) const {
-        std::string this_str = *this;
-        std::string other_str = other;
-        return this_str == other_str;
-    }
-};
-
 static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
     auto has_min = min_value != std::numeric_limits<int>::min();
     auto has_max = max_value != std::numeric_limits<int>::max();
@@ -112,14 +69,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
         }
         out << "}";
     };
-    std::function<void(const string_view &, const string_view &)> uniform_range =
-        [&](const string_view & from, const string_view & to) {
+    std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
+        [&](const std::string_view & from, const std::string_view & to) {
            size_t i = 0;
            while (i < from.length() && i < to.length() && from[i] == to[i]) {
                i++;
            }
            if (i > 0) {
-                out << "\"" << from.substr(0, i)
+                out << "\"" << from.substr(0, i) << "\"";
            }
            if (i < from.length() && i < to.length()) {
                if (i > 0) {
package/src/llama.cpp/ggml/CMakeLists.txt
CHANGED
@@ -131,6 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
 option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
 option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
 option(GGML_VXE "ggml: enable vxe" ON)
+option(GGML_NNPA "ggml: enable nnpa" ON)
 
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
 set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
package/src/llama.cpp/ggml/include/ggml-cpu.h
CHANGED
@@ -101,6 +101,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
     GGML_BACKEND_API int ggml_cpu_has_vsx (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe (void);
+    GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
     GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
 
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
CHANGED
@@ -448,6 +448,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
        # TODO: Separation to determine activation of VX/VXE/VXE2
        if (${S390X_M} MATCHES "8561|8562")
+            set(GGML_NNPA OFF)
            message(STATUS "z15 target")
            list(APPEND ARCH_FLAGS -march=z15)
        elseif (${S390X_M} MATCHES "3931")
@@ -464,7 +465,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        endif()
 
        if (GGML_VXE)
+            message(STATUS "VX/VXE/VXE2 enabled")
            list(APPEND ARCH_FLAGS -mvx -mzvector)
+            list(APPEND ARCH_DEFINITIONS GGML_VXE)
+        endif()
+
+        if (GGML_NNPA)
+            message(STATUS "NNPA enabled")
+            list(APPEND ARCH_DEFINITIONS GGML_NNPA)
        endif()
    elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
        message(STATUS "Wasm detected")
package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp
CHANGED
@@ -8,6 +8,7 @@
 #include "mmq.h"
 #include "ggml-impl.h"
 #include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
 #include "quants.h"
 #include "ggml-quants.h"
 #include <algorithm>
@@ -453,7 +454,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_
 
        // Quantize these floats
        const float iscale = 127.f / amax;
-        y[i].d =
+        y[i].d = GGML_CPU_FP32_TO_FP16(1 / iscale);
        const float id = ( amax != 0.0f ) ? iscale : 0.f;
        const __m512 vscale = _mm512_set1_ps(id);
 
@@ -1090,7 +1091,7 @@ struct acc_C<block_q8_0, block_q4_0, is_acc> {
        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
 
        for (int m = 0; m < nr; ++m) {
-            const __m512 vd1 = _mm512_set1_ps(
+            const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
 
            __m512 vsum;
@@ -1113,8 +1114,8 @@ struct acc_C<block_q8_1, block_q4_1, is_acc> {
        const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half))));
 
        for (int m = 0; m < nr; ++m) {
-            const __m512 vd1 = _mm512_set1_ps(
-            const __m512 vs1 = _mm512_set1_ps(
+            const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].s));
            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
 
            __m512 vsum;
@@ -1137,7 +1138,7 @@ struct acc_C<block_q8_0, block_q8_0, is_acc> {
        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
 
        for (int m = 0; m < nr; ++m) {
-            const __m512 vd1 = _mm512_set1_ps(
+            const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
 
            __m512 vsum;
@@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
                va[k] = _mm512_set1_epi32(a_ptr[k]);
                vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
            }
-            vd1 = _mm512_set1_ps(
+            vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
        }
 
        // load b
@@ -1498,8 +1499,8 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
            for (int k = 0; k < 8; ++k) {
                va[k] = _mm512_set1_epi32(a_ptr[k]);
            }
-            vd1 = _mm512_set1_ps(
-            vs1 = _mm512_set1_ps(
+            vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
+            vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s));
        }
 
        // load b
@@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
                va[k] = _mm512_set1_epi32(a_ptr[k]);
                va[k] = _mm512_add_epi8(va[k], off);
            }
-            vd1 = _mm512_set1_ps(
+            vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
        }
 
        // load b