@fugood/llama.node 1.0.0-beta.7 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/lib/binding.ts +10 -0
- package/lib/index.js +8 -0
- package/lib/index.ts +14 -0
- package/package.json +14 -14
- package/src/LlamaContext.cpp +58 -8
- package/src/LlamaContext.h +1 -0
- package/src/RerankWorker.h +26 -0
- package/src/llama.cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/src/llama.cpp/include/llama.h +6 -3
- package/src/llama.cpp/src/llama-arch.cpp +54 -0
- package/src/llama.cpp/src/llama-arch.h +17 -0
- package/src/llama.cpp/src/llama-batch.cpp +20 -7
- package/src/llama.cpp/src/llama-chat.cpp +11 -6
- package/src/llama.cpp/src/llama-context.cpp +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +19 -4
- package/src/llama.cpp/src/llama-graph.h +14 -2
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
- package/src/llama.cpp/src/llama-kv-cells.h +33 -9
- package/src/llama.cpp/src/llama-model.cpp +518 -1
- package/src/llama.cpp/src/llama-model.h +22 -0
- package/src/llama.cpp/src/llama-quant.cpp +87 -5
package/CMakeLists.txt
CHANGED
package/lib/binding.ts
CHANGED
@@ -159,6 +159,15 @@ export type EmbeddingResult = {
   embedding: Float32Array
 }
 
+export type RerankParams = {
+  normalize?: number
+}
+
+export type RerankResult = {
+  score: number
+  index: number
+}
+
 export interface LlamaContext {
   new (options: LlamaModelOptions): LlamaContext
   getSystemInfo(): string
@@ -182,6 +191,7 @@ export interface LlamaContext {
   tokenize(text: string, media_paths?: string[]): Promise<TokenizeResult>
   detokenize(tokens: number[]): Promise<string>
   embedding(text: string): Promise<EmbeddingResult>
+  rerank(query: string, documents: string[], params?: RerankParams): Promise<RerankResult[]>
   saveSession(path: string): Promise<void>
   loadSession(path: string): Promise<void>
   release(): Promise<void>
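For orientation, the sketch below shows how this raw binding surface is meant to be called from TypeScript. It is a minimal, hypothetical helper rather than package code: `ctx` is assumed to be an already-constructed `LlamaContext`, and the native method returns plain `{ score, index }` records; the convenience sorting and `document` field are added later by the `lib/index` wrappers.

```ts
import type { LlamaContext, RerankParams, RerankResult } from './binding'

// Hypothetical helper over the raw binding method declared above.
// `params.normalize` is forwarded to the native side (see RerankParams).
async function rerankRaw(
  ctx: LlamaContext,
  query: string,
  documents: string[],
  params: RerankParams = {},
): Promise<RerankResult[]> {
  // Each result's `index` points back into the `documents` array.
  return ctx.rerank(query, documents, params)
}
```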
package/lib/index.js
CHANGED
@@ -176,6 +176,14 @@ class LlamaContextWrapper {
     embedding(text) {
         return this.ctx.embedding(text);
     }
+    rerank(query, documents, params) {
+        return this.ctx.rerank(query, documents, params).then((results) => {
+            // Sort by score descending and add document text for convenience
+            return results
+                .map((result) => (Object.assign(Object.assign({}, result), { document: documents[result.index] })))
+                .sort((a, b) => b.score - a.score);
+        });
+    }
     saveSession(path) {
         return this.ctx.saveSession(path);
     }
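The compiled wrapper above post-processes the native results before returning them. The sketch below isolates that transformation with made-up scores to show the resulting shape; the sample data is illustrative only.

```ts
// Illustration of the wrapper's post-processing step with hypothetical data:
// native rerank results arrive as { score, index } keyed to the input order,
// and the wrapper attaches the document text and sorts by score descending.
const documents = ['doc A', 'doc B', 'doc C']
const nativeResults = [
  { score: 0.12, index: 0 },
  { score: 0.87, index: 1 },
  { score: 0.45, index: 2 },
]

const ranked = nativeResults
  .map((result) => ({ ...result, document: documents[result.index] }))
  .sort((a, b) => b.score - a.score)

console.log(ranked)
// [ { score: 0.87, index: 1, document: 'doc B' },
//   { score: 0.45, index: 2, document: 'doc C' },
//   { score: 0.12, index: 0, document: 'doc A' } ]
```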
package/lib/index.ts
CHANGED
@@ -9,6 +9,8 @@ import type {
   LlamaCompletionResult,
   TokenizeResult,
   EmbeddingResult,
+  RerankParams,
+  RerankResult,
   CompletionResponseFormat,
 } from './binding'
 
@@ -226,6 +228,18 @@ class LlamaContextWrapper {
     return this.ctx.embedding(text)
   }
 
+  rerank(query: string, documents: string[], params?: RerankParams): Promise<Array<RerankResult & { document: string }>> {
+    return this.ctx.rerank(query, documents, params).then((results: RerankResult[]) => {
+      // Sort by score descending and add document text for convenience
+      return results
+        .map((result: RerankResult) => ({
+          ...result,
+          document: documents[result.index],
+        }))
+        .sort((a: RerankResult & { document: string }, b: RerankResult & { document: string }) => b.score - a.score)
+    })
+  }
+
   saveSession(path: string): Promise<void> {
     return this.ctx.saveSession(path)
   }
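Taken together, the wrapper method above resolves to `{ score, index, document }` entries sorted by score. The sketch below shows hypothetical end-to-end usage; the `loadModel` entry point, the model path, and the expectation that the loaded GGUF model supports reranking are assumptions for illustration, not part of this diff.

```ts
import { loadModel } from '@fugood/llama.node'

// Hypothetical usage of the rerank API added in this release.
async function main() {
  // Assumption: `loadModel` is the package's context factory and the model
  // file is a reranker-capable GGUF model.
  const ctx = await loadModel({ model: './reranker-model.gguf' })

  const ranked = await ctx.rerank(
    'What is the capital of France?',
    ['Berlin is the capital of Germany.', 'Paris is the capital of France.'],
    { normalize: 1 }, // RerankParams.normalize, forwarded to the native side
  )

  // Entries come back sorted by score descending, with the original document
  // attached by the lib/index wrapper.
  for (const r of ranked) {
    console.log(r.score.toFixed(3), r.index, r.document)
  }

  await ctx.release()
}

main()
```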
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.0.0-beta.7",
+  "version": "1.0.1",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -70,19 +70,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.0.0-beta.7",
-    "@fugood/node-llama-linux-x64-vulkan": "1.0.0-beta.7",
-    "@fugood/node-llama-linux-x64-cuda": "1.0.0-beta.7",
-    "@fugood/node-llama-linux-arm64": "1.0.0-beta.7",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.0.0-beta.7",
-    "@fugood/node-llama-linux-arm64-cuda": "1.0.0-beta.7",
-    "@fugood/node-llama-win32-x64": "1.0.0-beta.7",
-    "@fugood/node-llama-win32-x64-vulkan": "1.0.0-beta.7",
-    "@fugood/node-llama-win32-x64-cuda": "1.0.0-beta.7",
-    "@fugood/node-llama-win32-arm64": "1.0.0-beta.7",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.0.0-beta.7",
-    "@fugood/node-llama-darwin-x64": "1.0.0-beta.7",
-    "@fugood/node-llama-darwin-arm64": "1.0.0-beta.7"
+    "@fugood/node-llama-linux-x64": "1.0.1",
+    "@fugood/node-llama-linux-x64-vulkan": "1.0.1",
+    "@fugood/node-llama-linux-x64-cuda": "1.0.1",
+    "@fugood/node-llama-linux-arm64": "1.0.1",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.0.1",
+    "@fugood/node-llama-linux-arm64-cuda": "1.0.1",
+    "@fugood/node-llama-win32-x64": "1.0.1",
+    "@fugood/node-llama-win32-x64-vulkan": "1.0.1",
+    "@fugood/node-llama-win32-x64-cuda": "1.0.1",
+    "@fugood/node-llama-win32-arm64": "1.0.1",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.0.1",
+    "@fugood/node-llama-darwin-x64": "1.0.1",
+    "@fugood/node-llama-darwin-arm64": "1.0.1"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/LlamaContext.cpp
CHANGED
@@ -3,6 +3,7 @@
 #include "DetokenizeWorker.h"
 #include "DisposeWorker.h"
 #include "EmbeddingWorker.h"
+#include "RerankWorker.h"
 #include "LlamaCompletionWorker.h"
 #include "LoadSessionWorker.h"
 #include "SaveSessionWorker.h"
@@ -110,6 +111,8 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
           static_cast<napi_property_attributes>(napi_enumerable)),
       InstanceMethod<&LlamaContext::Embedding>(
           "embedding", static_cast<napi_property_attributes>(napi_enumerable)),
+      InstanceMethod<&LlamaContext::Rerank>(
+          "rerank", static_cast<napi_property_attributes>(napi_enumerable)),
       InstanceMethod<&LlamaContext::SaveSession>(
           "saveSession",
           static_cast<napi_property_attributes>(napi_enumerable)),
@@ -583,7 +586,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
                : "{}";
     }
   }
-  auto tools_str = params.
+  auto tools_str = !is_nil(params.Get("tools"))
                        ? json_stringify(params.Get("tools").As<Napi::Array>())
                        : "";
   auto parallel_tool_calls =
@@ -591,9 +594,15 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
   auto tool_choice = get_option<std::string>(params, "tool_choice", "");
   auto enable_thinking = get_option<bool>(params, "enable_thinking", false);
 
-
-
-
+  common_chat_params chatParams;
+  try {
+    chatParams = getFormattedChatWithJinja(
+        _sess, _templates, messages, chat_template, json_schema_str, tools_str,
+        parallel_tool_calls, tool_choice, enable_thinking);
+  } catch (const std::exception &e) {
+    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
 
   Napi::Object result = Napi::Object::New(env);
   result.Set("prompt", chatParams.prompt);
@@ -790,7 +799,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   auto jinja = get_option<bool>(options, "jinja", false);
   if (jinja) {
     auto tools_str =
-        options.
+        !is_nil(options.Get("tools"))
             ? json_stringify(options.Get("tools").As<Napi::Array>())
             : "";
     auto parallel_tool_calls =
@@ -799,9 +808,16 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
         get_option<std::string>(options, "tool_choice", "none");
     auto enable_thinking = get_option<bool>(options, "enable_thinking", true);
 
-
-
-
+    common_chat_params chatParams;
+
+    try {
+      chatParams = getFormattedChatWithJinja(
+          _sess, _templates, json_stringify(messages), chat_template,
+          json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking);
+    } catch (const std::exception &e) {
+      Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+      return env.Undefined();
+    }
 
     params.prompt = chatParams.prompt;
 
@@ -982,6 +998,40 @@ Napi::Value LlamaContext::Embedding(const Napi::CallbackInfo &info) {
   return worker->Promise();
 }
 
+// rerank(query: string, documents: string[], params?: object): Promise<RerankResult[]>
+Napi::Value LlamaContext::Rerank(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() < 2 || !info[0].IsString() || !info[1].IsArray()) {
+    Napi::TypeError::New(env, "Query string and documents array expected").ThrowAsJavaScriptException();
+  }
+  if (_sess == nullptr) {
+    Napi::TypeError::New(env, "Context is disposed")
+        .ThrowAsJavaScriptException();
+  }
+
+  auto query = info[0].ToString().Utf8Value();
+  auto documents_array = info[1].As<Napi::Array>();
+
+  // Convert documents array to vector
+  std::vector<std::string> documents;
+  for (size_t i = 0; i < documents_array.Length(); i++) {
+    documents.push_back(documents_array.Get(i).ToString().Utf8Value());
+  }
+
+  auto options = Napi::Object::New(env);
+  if (info.Length() >= 3 && info[2].IsObject()) {
+    options = info[2].As<Napi::Object>();
+  }
+
+  common_params rerankParams;
+  rerankParams.embedding = true;
+  rerankParams.embd_normalize = get_option<int32_t>(options, "normalize", -1);
+
+  auto *worker = new RerankWorker(info, _sess, query, documents, rerankParams);
+  worker->Queue();
+  return worker->Promise();
+}
+
 // saveSession(path: string): Promise<void> throws error
 Napi::Value LlamaContext::SaveSession(const Napi::CallbackInfo &info) {
   Napi::Env env = info.Env();
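One user-visible effect of the try/catch blocks added around `getFormattedChatWithJinja` is that chat-template failures are now rethrown into JavaScript via `Napi::Error`, so callers can catch them like any other error. A hedged sketch follows; the `completion` option names are assumptions about the pre-existing API, and the context type is kept structural on purpose rather than imported.

```ts
// Sketch only: demonstrates catching a jinja chat-template error surfaced by
// the native layer. The option names below follow the existing completion API
// and are not introduced by this diff.
type ChatContext = {
  completion(options: {
    messages: { role: string; content: string }[]
    jinja?: boolean
  }): Promise<unknown>
}

async function formatAndComplete(ctx: ChatContext) {
  try {
    return await ctx.completion({
      messages: [{ role: 'user', content: 'Hello' }],
      jinja: true, // template rendering happens natively when jinja is enabled
    })
  } catch (err) {
    // Errors thrown by getFormattedChatWithJinja arrive here as regular Errors.
    console.error('chat template formatting failed:', (err as Error).message)
    return undefined
  }
}
```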
package/src/LlamaContext.h
CHANGED
@@ -28,6 +28,7 @@ private:
   Napi::Value Tokenize(const Napi::CallbackInfo &info);
   Napi::Value Detokenize(const Napi::CallbackInfo &info);
   Napi::Value Embedding(const Napi::CallbackInfo &info);
+  Napi::Value Rerank(const Napi::CallbackInfo &info);
   Napi::Value SaveSession(const Napi::CallbackInfo &info);
   Napi::Value LoadSession(const Napi::CallbackInfo &info);
   void ApplyLoraAdapters(const Napi::CallbackInfo &info);
package/src/RerankWorker.h
ADDED
@@ -0,0 +1,26 @@
+#include "common.hpp"
+#include <vector>
+
+struct RerankResult {
+  std::vector<float> scores;
+};
+
+class RerankWorker : public Napi::AsyncWorker,
+                     public Napi::Promise::Deferred {
+public:
+  RerankWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
+               std::string query, std::vector<std::string> documents,
+               common_params &params);
+
+protected:
+  void Execute();
+  void OnOK();
+  void OnError(const Napi::Error &err);
+
+private:
+  LlamaSessionPtr _sess;
+  std::string _query;
+  std::vector<std::string> _documents;
+  common_params _params;
+  RerankResult _result;
+};
package/src/llama.cpp/CMakeLists.txt
CHANGED
@@ -95,7 +95,7 @@ endif()
 if (NOT DEFINED LLAMA_BUILD_COMMIT)
     set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
 endif()
-set(LLAMA_INSTALL_VERSION 0.0.${
+set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
 
 # override ggml options
 set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
package/src/llama.cpp/common/json-schema-to-grammar.cpp
CHANGED
@@ -41,49 +41,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
     return result;
 }
 
-/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
-class string_view {
-    const std::string & _str;
-    const size_t _start;
-    const size_t _end;
-  public:
-    string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
-
-    size_t size() const {
-        return _end - _start;
-    }
-
-    size_t length() const {
-        return size();
-    }
-
-    operator std::string() const {
-        return str();
-    }
-
-    std::string str() const {
-        return _str.substr(_start, _end - _start);
-    }
-
-    string_view substr(size_t pos, size_t len = std::string::npos) const {
-        return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
-    }
-
-    char operator[](size_t pos) const {
-        auto index = _start + pos;
-        if (index >= _end) {
-            throw std::out_of_range("string_view index out of range");
-        }
-        return _str[_start + pos];
-    }
-
-    bool operator==(const string_view & other) const {
-        std::string this_str = *this;
-        std::string other_str = other;
-        return this_str == other_str;
-    }
-};
-
 static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
     auto has_min = min_value != std::numeric_limits<int>::min();
     auto has_max = max_value != std::numeric_limits<int>::max();
@@ -112,14 +69,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
         }
         out << "}";
     };
-    std::function<void(const string_view &, const string_view &)> uniform_range =
-        [&](const string_view & from, const string_view & to) {
+    std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
+        [&](const std::string_view & from, const std::string_view & to) {
            size_t i = 0;
            while (i < from.length() && i < to.length() && from[i] == to[i]) {
                i++;
            }
            if (i > 0) {
-                out << "\"" << from.substr(0, i)
+                out << "\"" << from.substr(0, i) << "\"";
            }
            if (i < from.length() && i < to.length()) {
                if (i > 0) {
package/src/llama.cpp/ggml/CMakeLists.txt
CHANGED
@@ -131,6 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
 option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
 option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
 option(GGML_VXE "ggml: enable vxe" ON)
+option(GGML_NNPA "ggml: enable nnpa" ON)
 
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
 set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
package/src/llama.cpp/ggml/include/ggml-cpu.h
CHANGED
@@ -101,6 +101,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
     GGML_BACKEND_API int ggml_cpu_has_vsx (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe (void);
+    GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
     GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
 
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
CHANGED
@@ -448,6 +448,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         # TODO: Separation to determine activation of VX/VXE/VXE2
         if (${S390X_M} MATCHES "8561|8562")
+            set(GGML_NNPA OFF)
             message(STATUS "z15 target")
             list(APPEND ARCH_FLAGS -march=z15)
         elseif (${S390X_M} MATCHES "3931")
@@ -464,7 +465,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         endif()
 
         if (GGML_VXE)
+            message(STATUS "VX/VXE/VXE2 enabled")
             list(APPEND ARCH_FLAGS -mvx -mzvector)
+            list(APPEND ARCH_DEFINITIONS GGML_VXE)
+        endif()
+
+        if (GGML_NNPA)
+            message(STATUS "NNPA enabled")
+            list(APPEND ARCH_DEFINITIONS GGML_NNPA)
         endif()
     elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
         message(STATUS "Wasm detected")
package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp
CHANGED
@@ -8,6 +8,7 @@
 #include "mmq.h"
 #include "ggml-impl.h"
 #include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
 #include "quants.h"
 #include "ggml-quants.h"
 #include <algorithm>
@@ -453,7 +454,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_
 
         // Quantize these floats
         const float iscale = 127.f / amax;
-        y[i].d =
+        y[i].d = GGML_CPU_FP32_TO_FP16(1 / iscale);
         const float id = ( amax != 0.0f ) ? iscale : 0.f;
         const __m512 vscale = _mm512_set1_ps(id);
 
@@ -1090,7 +1091,7 @@ struct acc_C<block_q8_0, block_q4_0, is_acc> {
         const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
 
         for (int m = 0; m < nr; ++m) {
-            const __m512 vd1 = _mm512_set1_ps(
+            const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
             const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
 
             __m512 vsum;
@@ -1113,8 +1114,8 @@ struct acc_C<block_q8_1, block_q4_1, is_acc> {
         const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half))));
 
         for (int m = 0; m < nr; ++m) {
-            const __m512 vd1 = _mm512_set1_ps(
-            const __m512 vs1 = _mm512_set1_ps(
+            const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].s));
             const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
 
             __m512 vsum;
@@ -1137,7 +1138,7 @@ struct acc_C<block_q8_0, block_q8_0, is_acc> {
         const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
 
         for (int m = 0; m < nr; ++m) {
-            const __m512 vd1 = _mm512_set1_ps(
+            const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
             const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
 
             __m512 vsum;
@@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
                 va[k] = _mm512_set1_epi32(a_ptr[k]);
                 vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
             }
-            vd1 = _mm512_set1_ps(
+            vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
         }
 
         // load b
@@ -1498,8 +1499,8 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
             for (int k = 0; k < 8; ++k) {
                 va[k] = _mm512_set1_epi32(a_ptr[k]);
             }
-            vd1 = _mm512_set1_ps(
-            vs1 = _mm512_set1_ps(
+            vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
+            vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s));
         }
 
         // load b
@@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
                 va[k] = _mm512_set1_epi32(a_ptr[k]);
                 va[k] = _mm512_add_epi8(va[k], off);
             }
-            vd1 = _mm512_set1_ps(
+            vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
         }
 
         // load b