@fugood/llama.node 1.0.0-beta.6 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/CMakeLists.txt +2 -0
  2. package/lib/binding.ts +12 -0
  3. package/lib/index.js +10 -0
  4. package/lib/index.ts +17 -1
  5. package/package.json +14 -14
  6. package/src/EmbeddingWorker.cpp +1 -1
  7. package/src/LlamaCompletionWorker.cpp +7 -3
  8. package/src/LlamaCompletionWorker.h +2 -0
  9. package/src/LlamaContext.cpp +49 -6
  10. package/src/LlamaContext.h +1 -0
  11. package/src/RerankWorker.h +26 -0
  12. package/src/common.hpp +1 -1
  13. package/src/llama.cpp/CMakeLists.txt +1 -1
  14. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  15. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  16. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  27. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  28. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  29. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  31. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  34. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
  35. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  36. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
  37. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  39. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  40. package/src/llama.cpp/include/llama.h +6 -3
  41. package/src/llama.cpp/src/llama-arch.cpp +54 -0
  42. package/src/llama.cpp/src/llama-arch.h +17 -0
  43. package/src/llama.cpp/src/llama-batch.cpp +20 -7
  44. package/src/llama.cpp/src/llama-chat.cpp +11 -6
  45. package/src/llama.cpp/src/llama-context.cpp +0 -1
  46. package/src/llama.cpp/src/llama-graph.cpp +19 -4
  47. package/src/llama.cpp/src/llama-graph.h +14 -2
  48. package/src/llama.cpp/src/llama-hparams.h +6 -0
  49. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
  50. package/src/llama.cpp/src/llama-kv-cells.h +33 -9
  51. package/src/llama.cpp/src/llama-model.cpp +518 -1
  52. package/src/llama.cpp/src/llama-model.h +22 -0
  53. package/src/llama.cpp/src/llama-quant.cpp +87 -5
package/CMakeLists.txt CHANGED
@@ -140,6 +140,8 @@ file(
  "src/DetokenizeWorker.h"
  "src/EmbeddingWorker.cpp"
  "src/EmbeddingWorker.h"
+ "src/RerankWorker.cpp"
+ "src/RerankWorker.h"
  "src/LoadSessionWorker.cpp"
  "src/LoadSessionWorker.h"
  "src/SaveSessionWorker.cpp"
package/lib/binding.ts CHANGED
@@ -79,6 +79,8 @@ export type LlamaCompletionOptions = {
  tools?: object
  parallel_tool_calls?: boolean
  tool_choice?: string
+ enable_thinking?: boolean
+ thinking_forced_open?: boolean
  prompt?: string
  temperature?: number
  top_k?: number
@@ -157,6 +159,15 @@ export type EmbeddingResult = {
  embedding: Float32Array
  }

+ export type RerankParams = {
+ normalize?: number
+ }
+
+ export type RerankResult = {
+ score: number
+ index: number
+ }
+
  export interface LlamaContext {
  new (options: LlamaModelOptions): LlamaContext
  getSystemInfo(): string
@@ -180,6 +191,7 @@ export interface LlamaContext {
  tokenize(text: string, media_paths?: string[]): Promise<TokenizeResult>
  detokenize(tokens: number[]): Promise<string>
  embedding(text: string): Promise<EmbeddingResult>
+ rerank(query: string, documents: string[], params?: RerankParams): Promise<RerankResult[]>
  saveSession(path: string): Promise<void>
  loadSession(path: string): Promise<void>
  release(): Promise<void>
package/lib/index.js CHANGED
@@ -131,6 +131,7 @@ class LlamaContextWrapper {
  };
  }
  getFormattedChat(messages, template, params) {
+ var _a;
  const { messages: chat, has_media, media_paths, } = this._formatMediaChat(messages);
  const useJinja = this.isJinjaSupported() && (params === null || params === void 0 ? void 0 : params.jinja);
  let tmpl;
@@ -143,6 +144,7 @@ class LlamaContextWrapper {
  tools: params === null || params === void 0 ? void 0 : params.tools,
  parallel_tool_calls: params === null || params === void 0 ? void 0 : params.parallel_tool_calls,
  tool_choice: params === null || params === void 0 ? void 0 : params.tool_choice,
+ enable_thinking: (_a = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _a !== void 0 ? _a : true,
  });
  if (!useJinja) {
  return {
@@ -174,6 +176,14 @@ class LlamaContextWrapper {
  embedding(text) {
  return this.ctx.embedding(text);
  }
+ rerank(query, documents, params) {
+ return this.ctx.rerank(query, documents, params).then((results) => {
+ // Sort by score descending and add document text for convenience
+ return results
+ .map((result) => (Object.assign(Object.assign({}, result), { document: documents[result.index] })))
+ .sort((a, b) => b.score - a.score);
+ });
+ }
  saveSession(path) {
  return this.ctx.saveSession(path);
  }
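The wrapper above resolves rerank() with the native results sorted by score (descending) and augmented with the matching document text. A minimal usage sketch of the new API follows; it assumes `ctx` is an already-initialized context created from a reranking-capable model, which is outside this diff:

    // Illustrative only: rerank a few candidate passages against a query.
    const documents = [
      'Llamas are South American camelids.',
      'The Eiffel Tower is in Paris.',
    ]
    const ranked = await ctx.rerank('Which animal family do llamas belong to?', documents, {
      normalize: -1, // optional; forwarded to the native embd_normalize setting
    })
    // Each entry carries { score, index, document }, already sorted by score.
    for (const r of ranked) console.log(r.score.toFixed(3), r.index, r.document)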
package/lib/index.ts CHANGED
@@ -9,6 +9,8 @@ import type {
  LlamaCompletionResult,
  TokenizeResult,
  EmbeddingResult,
+ RerankParams,
+ RerankResult,
  CompletionResponseFormat,
  } from './binding'

@@ -158,7 +160,8 @@ class LlamaContextWrapper {
  response_format?: CompletionResponseFormat
  tools?: object
  parallel_tool_calls?: object
- tool_choice?: string
+ tool_choice?: string,
+ enable_thinking?: boolean,
  },
  ): FormattedChatResult {
  const {
@@ -178,6 +181,7 @@ class LlamaContextWrapper {
  tools: params?.tools,
  parallel_tool_calls: params?.parallel_tool_calls,
  tool_choice: params?.tool_choice,
+ enable_thinking: params?.enable_thinking ?? true,
  })

  if (!useJinja) {
@@ -224,6 +228,18 @@ class LlamaContextWrapper {
  return this.ctx.embedding(text)
  }

+ rerank(query: string, documents: string[], params?: RerankParams): Promise<Array<RerankResult & { document: string }>> {
+ return this.ctx.rerank(query, documents, params).then((results: RerankResult[]) => {
+ // Sort by score descending and add document text for convenience
+ return results
+ .map((result: RerankResult) => ({
+ ...result,
+ document: documents[result.index],
+ }))
+ .sort((a: RerankResult & { document: string }, b: RerankResult & { document: string }) => b.score - a.score)
+ })
+ }
+
  saveSession(path: string): Promise<void> {
  return this.ctx.saveSession(path)
  }
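The TypeScript wrapper mirrors the same rerank logic and also threads the new enable_thinking flag (defaulting to true) into Jinja chat formatting. A short sketch of turning thinking off when formatting a chat, assuming `ctx` is an initialized context wrapper and the messages are illustrative:

    // Sketch: ask the chat template to suppress its reasoning/"thinking" block.
    const formatted = ctx.getFormattedChat(
      [{ role: 'user', content: 'List the new APIs in this release.' }],
      undefined, // use the model's built-in chat template
      { jinja: true, enable_thinking: false },
    )
    // With Jinja formatting, the native side also reports thinking_forced_open,
    // which feeds back into completion parsing (see LlamaContext.cpp below).
    console.log(formatted.prompt)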
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.0.0-beta.6",
+ "version": "1.0.0",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -70,19 +70,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.0.0-beta.6",
- "@fugood/node-llama-linux-x64-vulkan": "1.0.0-beta.6",
- "@fugood/node-llama-linux-x64-cuda": "1.0.0-beta.6",
- "@fugood/node-llama-linux-arm64": "1.0.0-beta.6",
- "@fugood/node-llama-linux-arm64-vulkan": "1.0.0-beta.6",
- "@fugood/node-llama-linux-arm64-cuda": "1.0.0-beta.6",
- "@fugood/node-llama-win32-x64": "1.0.0-beta.6",
- "@fugood/node-llama-win32-x64-vulkan": "1.0.0-beta.6",
- "@fugood/node-llama-win32-x64-cuda": "1.0.0-beta.6",
- "@fugood/node-llama-win32-arm64": "1.0.0-beta.6",
- "@fugood/node-llama-win32-arm64-vulkan": "1.0.0-beta.6",
- "@fugood/node-llama-darwin-x64": "1.0.0-beta.6",
- "@fugood/node-llama-darwin-arm64": "1.0.0-beta.6"
+ "@fugood/node-llama-linux-x64": "1.0.0",
+ "@fugood/node-llama-linux-x64-vulkan": "1.0.0",
+ "@fugood/node-llama-linux-x64-cuda": "1.0.0",
+ "@fugood/node-llama-linux-arm64": "1.0.0",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.0.0",
+ "@fugood/node-llama-linux-arm64-cuda": "1.0.0",
+ "@fugood/node-llama-win32-x64": "1.0.0",
+ "@fugood/node-llama-win32-x64-vulkan": "1.0.0",
+ "@fugood/node-llama-win32-x64-cuda": "1.0.0",
+ "@fugood/node-llama-win32-arm64": "1.0.0",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.0.0",
+ "@fugood/node-llama-darwin-x64": "1.0.0",
+ "@fugood/node-llama-darwin-arm64": "1.0.0"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
package/src/EmbeddingWorker.cpp CHANGED
@@ -8,7 +8,7 @@ EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info,
  _params(params) {}

  void EmbeddingWorker::Execute() {
- llama_kv_self_clear(_sess->context());
+ llama_memory_clear(llama_get_memory(_sess->context()), true);
  auto tokens = ::common_tokenize(_sess->context(), _text, true);
  // add SEP if not present
  auto vocab = llama_model_get_vocab(_sess->model());
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -29,11 +29,13 @@ LlamaCompletionWorker::LlamaCompletionWorker(
  common_params params,
  std::vector<std::string> stop_words,
  int32_t chat_format,
+ bool thinking_forced_open,
  std::string reasoning_format,
  const std::vector<std::string> &media_paths,
  const std::vector<llama_token> &guide_tokens)
  : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
  _params(params), _stop_words(stop_words), _chat_format(chat_format),
+ _thinking_forced_open(thinking_forced_open),
  _reasoning_format(reasoning_format),
  _media_paths(media_paths), _guide_tokens(guide_tokens) {
  if (!callback.IsEmpty()) {
@@ -113,7 +115,7 @@ void LlamaCompletionWorker::Execute() {
  --n_cur;
  }
  n_input -= n_cur;
- llama_kv_self_seq_rm(ctx, 0, n_cur, -1);
+ llama_memory_seq_rm(llama_get_memory(ctx), 0, n_cur, -1);
  }
  // Set the tokens
  _sess->set_tokens(std::move(prompt_tokens));
@@ -135,8 +137,9 @@ void LlamaCompletionWorker::Execute() {
  const int n_left = n_cur - n_keep - 1;
  const int n_discard = n_left / 2;

- llama_kv_self_seq_rm(ctx, 0, n_keep + 1, n_keep + n_discard + 1);
- llama_kv_self_seq_add(ctx, 0, n_keep + 1 + n_discard, n_cur, -n_discard);
+ auto mem = llama_get_memory(ctx);
+ llama_memory_seq_rm(mem, 0, n_keep + 1, n_keep + n_discard + 1);
+ llama_memory_seq_add(mem, 0, n_keep + 1 + n_discard, n_cur, -n_discard);

  // shift the tokens
  embd->insert(embd->begin() + n_keep + 1,
@@ -240,6 +243,7 @@ void LlamaCompletionWorker::OnOK() {
  try {
  common_chat_syntax chat_syntax;
  chat_syntax.format = static_cast<common_chat_format>(_chat_format);
+ chat_syntax.thinking_forced_open = _thinking_forced_open;

  if (_reasoning_format == "deepseek") {
  chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
package/src/LlamaCompletionWorker.h CHANGED
@@ -20,6 +20,7 @@ public:
  Napi::Function callback, common_params params,
  std::vector<std::string> stop_words,
  int32_t chat_format,
+ bool thinking_forced_open,
  std::string reasoning_format,
  const std::vector<std::string> &media_paths = {},
  const std::vector<llama_token> &guide_tokens = {});
@@ -42,6 +43,7 @@ private:
  common_params _params;
  std::vector<std::string> _stop_words;
  int32_t _chat_format;
+ bool _thinking_forced_open;
  std::string _reasoning_format;
  std::vector<std::string> _media_paths;
  std::vector<llama_token> _guide_tokens;
package/src/LlamaContext.cpp CHANGED
@@ -3,6 +3,7 @@
  #include "DetokenizeWorker.h"
  #include "DisposeWorker.h"
  #include "EmbeddingWorker.h"
+ #include "RerankWorker.h"
  #include "LlamaCompletionWorker.h"
  #include "LoadSessionWorker.h"
  #include "SaveSessionWorker.h"
@@ -110,6 +111,8 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
  static_cast<napi_property_attributes>(napi_enumerable)),
  InstanceMethod<&LlamaContext::Embedding>(
  "embedding", static_cast<napi_property_attributes>(napi_enumerable)),
+ InstanceMethod<&LlamaContext::Rerank>(
+ "rerank", static_cast<napi_property_attributes>(napi_enumerable)),
  InstanceMethod<&LlamaContext::SaveSession>(
  "saveSession",
  static_cast<napi_property_attributes>(napi_enumerable)),
@@ -499,7 +502,9 @@ common_chat_params getFormattedChatWithJinja(
  const common_chat_templates_ptr &templates, const std::string &messages,
  const std::string &chat_template, const std::string &json_schema,
  const std::string &tools, const bool &parallel_tool_calls,
- const std::string &tool_choice) {
+ const std::string &tool_choice,
+ const bool &enable_thinking
+ ) {
  common_chat_templates_inputs inputs;
  inputs.messages = common_chat_msgs_parse_oaicompat(json::parse(messages));
  auto useTools = !tools.empty();
@@ -513,6 +518,7 @@ common_chat_params getFormattedChatWithJinja(
  if (!json_schema.empty()) {
  inputs.json_schema = json::parse(json_schema);
  }
+ inputs.enable_thinking = enable_thinking;

  // If chat_template is provided, create new one and use it (probably slow)
  if (!chat_template.empty()) {
@@ -586,12 +592,11 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
  auto parallel_tool_calls =
  get_option<bool>(params, "parallel_tool_calls", false);
  auto tool_choice = get_option<std::string>(params, "tool_choice", "");
+ auto enable_thinking = get_option<bool>(params, "enable_thinking", false);

  auto chatParams = getFormattedChatWithJinja(
  _sess, _templates, messages, chat_template, json_schema_str, tools_str,
- parallel_tool_calls, tool_choice);
-
- console_log(env, std::string("format: ") + std::to_string(chatParams.format));
+ parallel_tool_calls, tool_choice, enable_thinking);

  Napi::Object result = Napi::Object::New(env);
  result.Set("prompt", chatParams.prompt);
@@ -612,6 +617,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
  grammar_triggers.Set(i, triggerObj);
  }
  result.Set("grammar_triggers", grammar_triggers);
+ result.Set("thinking_forced_open", chatParams.thinking_forced_open);
  // preserved_tokens: string[]
  Napi::Array preserved_tokens = Napi::Array::New(env);
  for (size_t i = 0; i < chatParams.preserved_tokens.size(); i++) {
@@ -685,6 +691,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  }

  int32_t chat_format = get_option<int32_t>(options, "chat_format", 0);
+ bool thinking_forced_open = get_option<bool>(options, "thinking_forced_open", false);
  std::string reasoning_format = get_option<std::string>(options, "reasoning_format", "none");

  common_params params = _sess->params();
@@ -793,14 +800,16 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  get_option<bool>(options, "parallel_tool_calls", false);
  auto tool_choice =
  get_option<std::string>(options, "tool_choice", "none");
+ auto enable_thinking = get_option<bool>(options, "enable_thinking", true);

  auto chatParams = getFormattedChatWithJinja(
  _sess, _templates, json_stringify(messages), chat_template,
- json_schema_str, tools_str, parallel_tool_calls, tool_choice);
+ json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking);

  params.prompt = chatParams.prompt;

  chat_format = chatParams.format;
+ thinking_forced_open = chatParams.thinking_forced_open;

  for (const auto &token : chatParams.preserved_tokens) {
  auto ids =
@@ -895,7 +904,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {

  auto *worker =
  new LlamaCompletionWorker(info, _sess, callback, params, stop_words,
- chat_format, reasoning_format, media_paths, guide_tokens);
+ chat_format, thinking_forced_open, reasoning_format, media_paths, guide_tokens);
  worker->Queue();
  _wip = worker;
  worker->OnComplete([this]() { _wip = nullptr; });
@@ -976,6 +985,40 @@ Napi::Value LlamaContext::Embedding(const Napi::CallbackInfo &info) {
  return worker->Promise();
  }

+ // rerank(query: string, documents: string[], params?: object): Promise<RerankResult[]>
+ Napi::Value LlamaContext::Rerank(const Napi::CallbackInfo &info) {
+ Napi::Env env = info.Env();
+ if (info.Length() < 2 || !info[0].IsString() || !info[1].IsArray()) {
+ Napi::TypeError::New(env, "Query string and documents array expected").ThrowAsJavaScriptException();
+ }
+ if (_sess == nullptr) {
+ Napi::TypeError::New(env, "Context is disposed")
+ .ThrowAsJavaScriptException();
+ }
+
+ auto query = info[0].ToString().Utf8Value();
+ auto documents_array = info[1].As<Napi::Array>();
+
+ // Convert documents array to vector
+ std::vector<std::string> documents;
+ for (size_t i = 0; i < documents_array.Length(); i++) {
+ documents.push_back(documents_array.Get(i).ToString().Utf8Value());
+ }
+
+ auto options = Napi::Object::New(env);
+ if (info.Length() >= 3 && info[2].IsObject()) {
+ options = info[2].As<Napi::Object>();
+ }
+
+ common_params rerankParams;
+ rerankParams.embedding = true;
+ rerankParams.embd_normalize = get_option<int32_t>(options, "normalize", -1);
+
+ auto *worker = new RerankWorker(info, _sess, query, documents, rerankParams);
+ worker->Queue();
+ return worker->Promise();
+ }
+
  // saveSession(path: string): Promise<void> throws error
  Napi::Value LlamaContext::SaveSession(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();
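GetFormattedChat now exposes thinking_forced_open, and Completion accepts it (alongside chat_format and reasoning_format) so the worker can parse reasoning content even when the caller formats the prompt itself. A hedged sketch of that two-step flow; it assumes the context exposes a completion() method taking the LlamaCompletionOptions shown in binding.ts, and that the formatted result carries a chat_format field (neither is shown in this excerpt):

    // Sketch only: format first, then feed the formatter's outputs back into completion.
    const chat = ctx.getFormattedChat(messages, undefined, { jinja: true, enable_thinking: true })
    const result = await ctx.completion({
      prompt: chat.prompt,
      chat_format: chat.chat_format, // assumption: exposed on the formatted result
      thinking_forced_open: chat.thinking_forced_open,
      reasoning_format: 'deepseek', // parse <think> blocks into reasoning content
    })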
package/src/LlamaContext.h CHANGED
@@ -28,6 +28,7 @@ private:
  Napi::Value Tokenize(const Napi::CallbackInfo &info);
  Napi::Value Detokenize(const Napi::CallbackInfo &info);
  Napi::Value Embedding(const Napi::CallbackInfo &info);
+ Napi::Value Rerank(const Napi::CallbackInfo &info);
  Napi::Value SaveSession(const Napi::CallbackInfo &info);
  Napi::Value LoadSession(const Napi::CallbackInfo &info);
  void ApplyLoraAdapters(const Napi::CallbackInfo &info);
package/src/RerankWorker.h ADDED
@@ -0,0 +1,26 @@
+ #include "common.hpp"
+ #include <vector>
+
+ struct RerankResult {
+ std::vector<float> scores;
+ };
+
+ class RerankWorker : public Napi::AsyncWorker,
+ public Napi::Promise::Deferred {
+ public:
+ RerankWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
+ std::string query, std::vector<std::string> documents,
+ common_params &params);
+
+ protected:
+ void Execute();
+ void OnOK();
+ void OnError(const Napi::Error &err);
+
+ private:
+ LlamaSessionPtr _sess;
+ std::string _query;
+ std::vector<std::string> _documents;
+ common_params _params;
+ RerankResult _result;
+ };
package/src/common.hpp CHANGED
@@ -461,7 +461,7 @@ processMediaPrompt(llama_context *ctx, mtmd_context *mtmd_ctx,
  }

  // Clear all KV cache entries after position n_past
- llama_kv_self_seq_rm(ctx, 0, n_past, -1);
+ llama_memory_seq_rm(llama_get_memory(ctx), 0, n_past, -1);

  size_t num_chunks = mtmd_input_chunks_size(chunks);

package/src/llama.cpp/CMakeLists.txt CHANGED
@@ -95,7 +95,7 @@ endif()
  if (NOT DEFINED LLAMA_BUILD_COMMIT)
  set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
  endif()
- set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
+ set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})

  # override ggml options
  set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
package/src/llama.cpp/common/json-schema-to-grammar.cpp CHANGED
@@ -41,49 +41,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
  return result;
  }

- /* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
- class string_view {
- const std::string & _str;
- const size_t _start;
- const size_t _end;
- public:
- string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
-
- size_t size() const {
- return _end - _start;
- }
-
- size_t length() const {
- return size();
- }
-
- operator std::string() const {
- return str();
- }
-
- std::string str() const {
- return _str.substr(_start, _end - _start);
- }
-
- string_view substr(size_t pos, size_t len = std::string::npos) const {
- return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
- }
-
- char operator[](size_t pos) const {
- auto index = _start + pos;
- if (index >= _end) {
- throw std::out_of_range("string_view index out of range");
- }
- return _str[_start + pos];
- }
-
- bool operator==(const string_view & other) const {
- std::string this_str = *this;
- std::string other_str = other;
- return this_str == other_str;
- }
- };
-
  static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
  auto has_min = min_value != std::numeric_limits<int>::min();
  auto has_max = max_value != std::numeric_limits<int>::max();
@@ -112,14 +69,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
  }
  out << "}";
  };
- std::function<void(const string_view &, const string_view &)> uniform_range =
- [&](const string_view & from, const string_view & to) {
+ std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
+ [&](const std::string_view & from, const std::string_view & to) {
  size_t i = 0;
  while (i < from.length() && i < to.length() && from[i] == to[i]) {
  i++;
  }
  if (i > 0) {
- out << "\"" << from.substr(0, i).str() << "\"";
+ out << "\"" << from.substr(0, i) << "\"";
  }
  if (i < from.length() && i < to.length()) {
  if (i > 0) {
package/src/llama.cpp/ggml/CMakeLists.txt CHANGED
@@ -131,6 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
  option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
  option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
  option(GGML_VXE "ggml: enable vxe" ON)
+ option(GGML_NNPA "ggml: enable nnpa" ON)

  option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
  set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
package/src/llama.cpp/ggml/include/ggml-cpu.h CHANGED
@@ -101,6 +101,7 @@ extern "C" {
  GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
  GGML_BACKEND_API int ggml_cpu_has_vsx (void);
  GGML_BACKEND_API int ggml_cpu_has_vxe (void);
+ GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
  GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
  GGML_BACKEND_API int ggml_cpu_has_llamafile (void);

package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt CHANGED
@@ -448,6 +448,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

  # TODO: Separation to determine activation of VX/VXE/VXE2
  if (${S390X_M} MATCHES "8561|8562")
+ set(GGML_NNPA OFF)
  message(STATUS "z15 target")
  list(APPEND ARCH_FLAGS -march=z15)
  elseif (${S390X_M} MATCHES "3931")
@@ -464,7 +465,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  endif()

  if (GGML_VXE)
+ message(STATUS "VX/VXE/VXE2 enabled")
  list(APPEND ARCH_FLAGS -mvx -mzvector)
+ list(APPEND ARCH_DEFINITIONS GGML_VXE)
+ endif()
+
+ if (GGML_NNPA)
+ message(STATUS "NNPA enabled")
+ list(APPEND ARCH_DEFINITIONS GGML_NNPA)
  endif()
  elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
  message(STATUS "Wasm detected")
package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp CHANGED
@@ -8,6 +8,7 @@
  #include "mmq.h"
  #include "ggml-impl.h"
  #include "ggml-cpu-impl.h"
+ #include "simd-mappings.h"
  #include "quants.h"
  #include "ggml-quants.h"
  #include <algorithm>
@@ -453,7 +454,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_

  // Quantize these floats
  const float iscale = 127.f / amax;
- y[i].d = GGML_FP32_TO_FP16(1 / iscale);
+ y[i].d = GGML_CPU_FP32_TO_FP16(1 / iscale);
  const float id = ( amax != 0.0f ) ? iscale : 0.f;
  const __m512 vscale = _mm512_set1_ps(id);

@@ -1090,7 +1091,7 @@ struct acc_C<block_q8_0, block_q4_0, is_acc> {
  const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));

  for (int m = 0; m < nr; ++m) {
- const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
+ const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
  const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

  __m512 vsum;
@@ -1113,8 +1114,8 @@ struct acc_C<block_q8_1, block_q4_1, is_acc> {
  const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half))));

  for (int m = 0; m < nr; ++m) {
- const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
- const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s));
+ const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
+ const __m512 vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].s));
  const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

  __m512 vsum;
@@ -1137,7 +1138,7 @@ struct acc_C<block_q8_0, block_q8_0, is_acc> {
  const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));

  for (int m = 0; m < nr; ++m) {
- const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
+ const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
  const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

  __m512 vsum;
@@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
  va[k] = _mm512_set1_epi32(a_ptr[k]);
  vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
  }
- vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
+ vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
  }

  // load b
@@ -1498,8 +1499,8 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
  for (int k = 0; k < 8; ++k) {
  va[k] = _mm512_set1_epi32(a_ptr[k]);
  }
- vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
- vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s));
+ vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
+ vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s));
  }

  // load b
@@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
  va[k] = _mm512_set1_epi32(a_ptr[k]);
  va[k] = _mm512_add_epi8(va[k], off);
  }
- vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
+ vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
  }

  // load b