@fugood/llama.node 1.0.0-beta.7 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/CMakeLists.txt +2 -0
  2. package/lib/binding.ts +10 -0
  3. package/lib/index.js +8 -0
  4. package/lib/index.ts +14 -0
  5. package/package.json +14 -14
  6. package/src/LlamaContext.cpp +58 -8
  7. package/src/LlamaContext.h +1 -0
  8. package/src/RerankWorker.h +26 -0
  9. package/src/llama.cpp/CMakeLists.txt +1 -1
  10. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  11. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  12. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  25. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  26. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  27. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
  29. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
  31. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  32. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
  33. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  34. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  35. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  36. package/src/llama.cpp/include/llama.h +6 -3
  37. package/src/llama.cpp/src/llama-arch.cpp +54 -0
  38. package/src/llama.cpp/src/llama-arch.h +17 -0
  39. package/src/llama.cpp/src/llama-batch.cpp +20 -7
  40. package/src/llama.cpp/src/llama-chat.cpp +11 -6
  41. package/src/llama.cpp/src/llama-context.cpp +0 -1
  42. package/src/llama.cpp/src/llama-graph.cpp +19 -4
  43. package/src/llama.cpp/src/llama-graph.h +14 -2
  44. package/src/llama.cpp/src/llama-hparams.h +6 -0
  45. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
  46. package/src/llama.cpp/src/llama-kv-cells.h +33 -9
  47. package/src/llama.cpp/src/llama-model.cpp +518 -1
  48. package/src/llama.cpp/src/llama-model.h +22 -0
  49. package/src/llama.cpp/src/llama-quant.cpp +87 -5
package/CMakeLists.txt CHANGED
@@ -140,6 +140,8 @@ file(
  "src/DetokenizeWorker.h"
  "src/EmbeddingWorker.cpp"
  "src/EmbeddingWorker.h"
+ "src/RerankWorker.cpp"
+ "src/RerankWorker.h"
  "src/LoadSessionWorker.cpp"
  "src/LoadSessionWorker.h"
  "src/SaveSessionWorker.cpp"
package/lib/binding.ts CHANGED
@@ -159,6 +159,15 @@ export type EmbeddingResult = {
  embedding: Float32Array
  }

+ export type RerankParams = {
+ normalize?: number
+ }
+
+ export type RerankResult = {
+ score: number
+ index: number
+ }
+
  export interface LlamaContext {
  new (options: LlamaModelOptions): LlamaContext
  getSystemInfo(): string
@@ -182,6 +191,7 @@ export interface LlamaContext {
  tokenize(text: string, media_paths?: string[]): Promise<TokenizeResult>
  detokenize(tokens: number[]): Promise<string>
  embedding(text: string): Promise<EmbeddingResult>
+ rerank(query: string, documents: string[], params?: RerankParams): Promise<RerankResult[]>
  saveSession(path: string): Promise<void>
  loadSession(path: string): Promise<void>
  release(): Promise<void>
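
The binding additions above introduce `RerankParams`, `RerankResult`, and a `rerank()` method on `LlamaContext`. A minimal TypeScript sketch of how a caller might use it, assuming a context created through the package's `loadModel` helper (the model path, the `embedding` option, and the document strings are placeholder assumptions, not part of this diff):

```ts
import { loadModel } from '@fugood/llama.node'

// Assumption: a reranking/embedding-capable GGUF model; the path is a placeholder.
const ctx = await loadModel({ model: './models/reranker.gguf', embedding: true })

// rerank(query, documents, params?) resolves to RerankResult[]:
// each entry carries a relevance score plus the index of the source document.
const results = await ctx.rerank(
  'What is the capital of France?',
  ['Paris is the capital of France.', 'Berlin is the capital of Germany.'],
  { normalize: -1 }, // RerankParams; -1 keeps the native default normalization
)

for (const { score, index } of results) {
  console.log(index, score.toFixed(4))
}
```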
package/lib/index.js CHANGED
@@ -176,6 +176,14 @@ class LlamaContextWrapper {
  embedding(text) {
  return this.ctx.embedding(text);
  }
+ rerank(query, documents, params) {
+ return this.ctx.rerank(query, documents, params).then((results) => {
+ // Sort by score descending and add document text for convenience
+ return results
+ .map((result) => (Object.assign(Object.assign({}, result), { document: documents[result.index] })))
+ .sort((a, b) => b.score - a.score);
+ });
+ }
  saveSession(path) {
  return this.ctx.saveSession(path);
  }
package/lib/index.ts CHANGED
@@ -9,6 +9,8 @@ import type {
  LlamaCompletionResult,
  TokenizeResult,
  EmbeddingResult,
+ RerankParams,
+ RerankResult,
  CompletionResponseFormat,
  } from './binding'

@@ -226,6 +228,18 @@ class LlamaContextWrapper {
  return this.ctx.embedding(text)
  }

+ rerank(query: string, documents: string[], params?: RerankParams): Promise<Array<RerankResult & { document: string }>> {
+ return this.ctx.rerank(query, documents, params).then((results: RerankResult[]) => {
+ // Sort by score descending and add document text for convenience
+ return results
+ .map((result: RerankResult) => ({
+ ...result,
+ document: documents[result.index],
+ }))
+ .sort((a: RerankResult & { document: string }, b: RerankResult & { document: string }) => b.score - a.score)
+ })
+ }
+
  saveSession(path: string): Promise<void> {
  return this.ctx.saveSession(path)
  }
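
Note that the wrapper's `rerank()` differs from the raw binding: it copies the matching document text onto each result and returns the array sorted by descending score. A short usage sketch, under the assumption that `ctx` is a `LlamaContextWrapper` created elsewhere (not shown in this diff):

```ts
const ranked = await ctx.rerank('best pizza in town', [
  'Review of a neighborhood pizzeria.',
  'A guide to fixing leaky faucets.',
])

// Results arrive sorted by score (highest first) with the original
// document string attached, so the top match is simply ranked[0].
const best = ranked[0]
console.log(best.document, best.score, best.index)
```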
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.0.0-beta.7",
+ "version": "1.0.1",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -70,19 +70,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.0.0-beta.7",
- "@fugood/node-llama-linux-x64-vulkan": "1.0.0-beta.7",
- "@fugood/node-llama-linux-x64-cuda": "1.0.0-beta.7",
- "@fugood/node-llama-linux-arm64": "1.0.0-beta.7",
- "@fugood/node-llama-linux-arm64-vulkan": "1.0.0-beta.7",
- "@fugood/node-llama-linux-arm64-cuda": "1.0.0-beta.7",
- "@fugood/node-llama-win32-x64": "1.0.0-beta.7",
- "@fugood/node-llama-win32-x64-vulkan": "1.0.0-beta.7",
- "@fugood/node-llama-win32-x64-cuda": "1.0.0-beta.7",
- "@fugood/node-llama-win32-arm64": "1.0.0-beta.7",
- "@fugood/node-llama-win32-arm64-vulkan": "1.0.0-beta.7",
- "@fugood/node-llama-darwin-x64": "1.0.0-beta.7",
- "@fugood/node-llama-darwin-arm64": "1.0.0-beta.7"
+ "@fugood/node-llama-linux-x64": "1.0.1",
+ "@fugood/node-llama-linux-x64-vulkan": "1.0.1",
+ "@fugood/node-llama-linux-x64-cuda": "1.0.1",
+ "@fugood/node-llama-linux-arm64": "1.0.1",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.0.1",
+ "@fugood/node-llama-linux-arm64-cuda": "1.0.1",
+ "@fugood/node-llama-win32-x64": "1.0.1",
+ "@fugood/node-llama-win32-x64-vulkan": "1.0.1",
+ "@fugood/node-llama-win32-x64-cuda": "1.0.1",
+ "@fugood/node-llama-win32-arm64": "1.0.1",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.0.1",
+ "@fugood/node-llama-darwin-x64": "1.0.1",
+ "@fugood/node-llama-darwin-arm64": "1.0.1"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
package/src/LlamaContext.cpp CHANGED
@@ -3,6 +3,7 @@
  #include "DetokenizeWorker.h"
  #include "DisposeWorker.h"
  #include "EmbeddingWorker.h"
+ #include "RerankWorker.h"
  #include "LlamaCompletionWorker.h"
  #include "LoadSessionWorker.h"
  #include "SaveSessionWorker.h"
@@ -110,6 +111,8 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
  static_cast<napi_property_attributes>(napi_enumerable)),
  InstanceMethod<&LlamaContext::Embedding>(
  "embedding", static_cast<napi_property_attributes>(napi_enumerable)),
+ InstanceMethod<&LlamaContext::Rerank>(
+ "rerank", static_cast<napi_property_attributes>(napi_enumerable)),
  InstanceMethod<&LlamaContext::SaveSession>(
  "saveSession",
  static_cast<napi_property_attributes>(napi_enumerable)),
@@ -583,7 +586,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
  : "{}";
  }
  }
- auto tools_str = params.Has("tools")
+ auto tools_str = !is_nil(params.Get("tools"))
  ? json_stringify(params.Get("tools").As<Napi::Array>())
  : "";
  auto parallel_tool_calls =
@@ -591,9 +594,15 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
  auto tool_choice = get_option<std::string>(params, "tool_choice", "");
  auto enable_thinking = get_option<bool>(params, "enable_thinking", false);

- auto chatParams = getFormattedChatWithJinja(
- _sess, _templates, messages, chat_template, json_schema_str, tools_str,
- parallel_tool_calls, tool_choice, enable_thinking);
+ common_chat_params chatParams;
+ try {
+ chatParams = getFormattedChatWithJinja(
+ _sess, _templates, messages, chat_template, json_schema_str, tools_str,
+ parallel_tool_calls, tool_choice, enable_thinking);
+ } catch (const std::exception &e) {
+ Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+ return env.Undefined();
+ }

  Napi::Object result = Napi::Object::New(env);
  result.Set("prompt", chatParams.prompt);
@@ -790,7 +799,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  auto jinja = get_option<bool>(options, "jinja", false);
  if (jinja) {
  auto tools_str =
- options.Has("tools")
+ !is_nil(options.Get("tools"))
  ? json_stringify(options.Get("tools").As<Napi::Array>())
  : "";
  auto parallel_tool_calls =
@@ -799,9 +808,16 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  get_option<std::string>(options, "tool_choice", "none");
  auto enable_thinking = get_option<bool>(options, "enable_thinking", true);

- auto chatParams = getFormattedChatWithJinja(
- _sess, _templates, json_stringify(messages), chat_template,
- json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking);
+ common_chat_params chatParams;
+
+ try {
+ chatParams = getFormattedChatWithJinja(
+ _sess, _templates, json_stringify(messages), chat_template,
+ json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking);
+ } catch (const std::exception &e) {
+ Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+ return env.Undefined();
+ }

  params.prompt = chatParams.prompt;

@@ -982,6 +998,40 @@ Napi::Value LlamaContext::Embedding(const Napi::CallbackInfo &info) {
  return worker->Promise();
  }

+ // rerank(query: string, documents: string[], params?: object): Promise<RerankResult[]>
+ Napi::Value LlamaContext::Rerank(const Napi::CallbackInfo &info) {
+ Napi::Env env = info.Env();
+ if (info.Length() < 2 || !info[0].IsString() || !info[1].IsArray()) {
+ Napi::TypeError::New(env, "Query string and documents array expected").ThrowAsJavaScriptException();
+ }
+ if (_sess == nullptr) {
+ Napi::TypeError::New(env, "Context is disposed")
+ .ThrowAsJavaScriptException();
+ }
+
+ auto query = info[0].ToString().Utf8Value();
+ auto documents_array = info[1].As<Napi::Array>();
+
+ // Convert documents array to vector
+ std::vector<std::string> documents;
+ for (size_t i = 0; i < documents_array.Length(); i++) {
+ documents.push_back(documents_array.Get(i).ToString().Utf8Value());
+ }
+
+ auto options = Napi::Object::New(env);
+ if (info.Length() >= 3 && info[2].IsObject()) {
+ options = info[2].As<Napi::Object>();
+ }
+
+ common_params rerankParams;
+ rerankParams.embedding = true;
+ rerankParams.embd_normalize = get_option<int32_t>(options, "normalize", -1);
+
+ auto *worker = new RerankWorker(info, _sess, query, documents, rerankParams);
+ worker->Queue();
+ return worker->Promise();
+ }
+
  // saveSession(path: string): Promise<void> throws error
  Napi::Value LlamaContext::SaveSession(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();
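
A behavioral note on the `GetFormattedChat` and `Completion` hunks above: failures from `getFormattedChatWithJinja` (for example, a broken chat template) are now caught and rethrown as JavaScript exceptions instead of escaping as native C++ exceptions. A hedged sketch of guarding a jinja-enabled call on the JS side; the exact `completion()` option names are assumptions drawn from the options parsed in this diff, not a confirmed API surface:

```ts
try {
  // jinja and tools mirror the options read in LlamaContext::Completion above;
  // the surrounding call shape and result fields are assumed for illustration.
  const result = await ctx.completion({ messages, jinja: true, tools })
  console.log(result)
} catch (err) {
  // With this change, a bad chat template surfaces here as a normal
  // JavaScript error rather than crashing the process.
  console.error('chat template error:', err)
}
```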
package/src/LlamaContext.h CHANGED
@@ -28,6 +28,7 @@ private:
  Napi::Value Tokenize(const Napi::CallbackInfo &info);
  Napi::Value Detokenize(const Napi::CallbackInfo &info);
  Napi::Value Embedding(const Napi::CallbackInfo &info);
+ Napi::Value Rerank(const Napi::CallbackInfo &info);
  Napi::Value SaveSession(const Napi::CallbackInfo &info);
  Napi::Value LoadSession(const Napi::CallbackInfo &info);
  void ApplyLoraAdapters(const Napi::CallbackInfo &info);
package/src/RerankWorker.h ADDED
@@ -0,0 +1,26 @@
+ #include "common.hpp"
+ #include <vector>
+
+ struct RerankResult {
+ std::vector<float> scores;
+ };
+
+ class RerankWorker : public Napi::AsyncWorker,
+ public Napi::Promise::Deferred {
+ public:
+ RerankWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
+ std::string query, std::vector<std::string> documents,
+ common_params &params);
+
+ protected:
+ void Execute();
+ void OnOK();
+ void OnError(const Napi::Error &err);
+
+ private:
+ LlamaSessionPtr _sess;
+ std::string _query;
+ std::vector<std::string> _documents;
+ common_params _params;
+ RerankResult _result;
+ };
package/src/llama.cpp/CMakeLists.txt CHANGED
@@ -95,7 +95,7 @@ endif()
  if (NOT DEFINED LLAMA_BUILD_COMMIT)
  set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
  endif()
- set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
+ set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})

  # override ggml options
  set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
package/src/llama.cpp/common/json-schema-to-grammar.cpp CHANGED
@@ -41,49 +41,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
  return result;
  }

- /* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
- class string_view {
- const std::string & _str;
- const size_t _start;
- const size_t _end;
- public:
- string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
-
- size_t size() const {
- return _end - _start;
- }
-
- size_t length() const {
- return size();
- }
-
- operator std::string() const {
- return str();
- }
-
- std::string str() const {
- return _str.substr(_start, _end - _start);
- }
-
- string_view substr(size_t pos, size_t len = std::string::npos) const {
- return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
- }
-
- char operator[](size_t pos) const {
- auto index = _start + pos;
- if (index >= _end) {
- throw std::out_of_range("string_view index out of range");
- }
- return _str[_start + pos];
- }
-
- bool operator==(const string_view & other) const {
- std::string this_str = *this;
- std::string other_str = other;
- return this_str == other_str;
- }
- };
-
  static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
  auto has_min = min_value != std::numeric_limits<int>::min();
  auto has_max = max_value != std::numeric_limits<int>::max();
@@ -112,14 +69,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
  }
  out << "}";
  };
- std::function<void(const string_view &, const string_view &)> uniform_range =
- [&](const string_view & from, const string_view & to) {
+ std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
+ [&](const std::string_view & from, const std::string_view & to) {
  size_t i = 0;
  while (i < from.length() && i < to.length() && from[i] == to[i]) {
  i++;
  }
  if (i > 0) {
- out << "\"" << from.substr(0, i).str() << "\"";
+ out << "\"" << from.substr(0, i) << "\"";
  }
  if (i < from.length() && i < to.length()) {
  if (i > 0) {
package/src/llama.cpp/ggml/CMakeLists.txt CHANGED
@@ -131,6 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
  option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
  option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
  option(GGML_VXE "ggml: enable vxe" ON)
+ option(GGML_NNPA "ggml: enable nnpa" ON)

  option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
  set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
package/src/llama.cpp/ggml/include/ggml-cpu.h CHANGED
@@ -101,6 +101,7 @@ extern "C" {
  GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
  GGML_BACKEND_API int ggml_cpu_has_vsx (void);
  GGML_BACKEND_API int ggml_cpu_has_vxe (void);
+ GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
  GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
  GGML_BACKEND_API int ggml_cpu_has_llamafile (void);

package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt CHANGED
@@ -448,6 +448,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

  # TODO: Separation to determine activation of VX/VXE/VXE2
  if (${S390X_M} MATCHES "8561|8562")
+ set(GGML_NNPA OFF)
  message(STATUS "z15 target")
  list(APPEND ARCH_FLAGS -march=z15)
  elseif (${S390X_M} MATCHES "3931")
@@ -464,7 +465,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  endif()

  if (GGML_VXE)
+ message(STATUS "VX/VXE/VXE2 enabled")
  list(APPEND ARCH_FLAGS -mvx -mzvector)
+ list(APPEND ARCH_DEFINITIONS GGML_VXE)
+ endif()
+
+ if (GGML_NNPA)
+ message(STATUS "NNPA enabled")
+ list(APPEND ARCH_DEFINITIONS GGML_NNPA)
  endif()
  elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
  message(STATUS "Wasm detected")
package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp CHANGED
@@ -8,6 +8,7 @@
  #include "mmq.h"
  #include "ggml-impl.h"
  #include "ggml-cpu-impl.h"
+ #include "simd-mappings.h"
  #include "quants.h"
  #include "ggml-quants.h"
  #include <algorithm>
@@ -453,7 +454,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_

  // Quantize these floats
  const float iscale = 127.f / amax;
- y[i].d = GGML_FP32_TO_FP16(1 / iscale);
+ y[i].d = GGML_CPU_FP32_TO_FP16(1 / iscale);
  const float id = ( amax != 0.0f ) ? iscale : 0.f;
  const __m512 vscale = _mm512_set1_ps(id);

@@ -1090,7 +1091,7 @@ struct acc_C<block_q8_0, block_q4_0, is_acc> {
  const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));

  for (int m = 0; m < nr; ++m) {
- const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
+ const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
  const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

  __m512 vsum;
@@ -1113,8 +1114,8 @@ struct acc_C<block_q8_1, block_q4_1, is_acc> {
  const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half))));

  for (int m = 0; m < nr; ++m) {
- const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
- const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s));
+ const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
+ const __m512 vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].s));
  const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

  __m512 vsum;
@@ -1137,7 +1138,7 @@ struct acc_C<block_q8_0, block_q8_0, is_acc> {
  const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));

  for (int m = 0; m < nr; ++m) {
- const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
+ const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
  const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

  __m512 vsum;
@@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
  va[k] = _mm512_set1_epi32(a_ptr[k]);
  vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
  }
- vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
+ vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
  }

  // load b
@@ -1498,8 +1499,8 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
  for (int k = 0; k < 8; ++k) {
  va[k] = _mm512_set1_epi32(a_ptr[k]);
  }
- vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
- vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s));
+ vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
+ vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s));
  }

  // load b
@@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
  va[k] = _mm512_set1_epi32(a_ptr[k]);
  va[k] = _mm512_add_epi8(va[k], off);
  }
- vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
+ vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
  }

  // load b