@fugood/llama.node 0.0.1-alpha.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +42 -7
- package/README.md +10 -0
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/lib/binding.js +1 -1
- package/lib/binding.ts +16 -2
- package/lib/index.ts +2 -2
- package/package.json +15 -3
- package/src/DetokenizeWorker.cpp +22 -0
- package/src/DetokenizeWorker.h +19 -0
- package/src/EmbeddingWorker.cpp +46 -0
- package/src/EmbeddingWorker.h +23 -0
- package/src/LlamaCompletionWorker.cpp +5 -1
- package/src/LlamaCompletionWorker.h +4 -0
- package/src/LlamaContext.cpp +80 -1
- package/src/LlamaContext.h +3 -0
- package/src/TokenizeWorker.cpp +26 -0
- package/src/TokenizeWorker.h +23 -0
- package/src/common.hpp +12 -7
- package/src/llama.cpp/CMakeLists.txt +13 -7
- package/src/llama.cpp/common/common.cpp +221 -173
- package/src/llama.cpp/common/common.h +19 -8
- package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
- package/src/llama.cpp/common/log.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +17 -1
- package/src/llama.cpp/common/sampling.h +28 -20
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
- package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
- package/src/llama.cpp/examples/llava/clip.cpp +74 -23
- package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
- package/src/llama.cpp/examples/main/main.cpp +10 -8
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/server/server.cpp +97 -86
- package/src/llama.cpp/examples/server/utils.hpp +17 -15
- package/src/llama.cpp/ggml-backend.c +7 -5
- package/src/llama.cpp/ggml-impl.h +339 -4
- package/src/llama.cpp/ggml-kompute.cpp +7 -0
- package/src/llama.cpp/ggml-opencl.cpp +1 -0
- package/src/llama.cpp/ggml-quants.c +302 -293
- package/src/llama.cpp/ggml-sycl.cpp +28 -16
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- package/src/llama.cpp/ggml-vulkan.cpp +951 -263
- package/src/llama.cpp/ggml.c +1469 -116
- package/src/llama.cpp/ggml.h +37 -7
- package/src/llama.cpp/llama.cpp +969 -432
- package/src/llama.cpp/llama.h +46 -14
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
- package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/sgemm.cpp +134 -103
- package/src/llama.cpp/sgemm.h +4 -2
- package/src/llama.cpp/tests/CMakeLists.txt +96 -36
- package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
- package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
- package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
- package/src/llama.cpp/unicode-data.cpp +1188 -656
- package/src/llama.cpp/unicode-data.h +4 -3
- package/src/llama.cpp/unicode.cpp +590 -49
- package/src/llama.cpp/unicode.h +6 -3
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
- package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190
package/CMakeLists.txt
CHANGED
@@ -26,7 +26,7 @@ string(REPLACE "i686" "ia32" ARCH ${ARCH})
 string(REPLACE "i386" "ia32" ARCH ${ARCH})
 string(REPLACE "armv7l" "arm" ARCH ${ARCH})
 string(REPLACE "arm" "arm" ARCH ${ARCH})
-string(REPLACE "
+string(REPLACE "arm64x" "arm64" ARCH ${ARCH})
 string(REPLACE "aarch64" "arm64" ARCH ${ARCH})

 if(DEFINED VARIANT)
@@ -58,6 +58,12 @@ include_directories(${CMAKE_JS_INC})
 # flags: -fPIC
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)

+# VULKAN_SDK
+if (VULKAN_SDK)
+  set(ENV{VULKAN_SDK} ${VULKAN_SDK})
+  find_package(Vulkan REQUIRED)
+endif()
+
 set(LLAMA_STATIC ON CACHE BOOL "Build llama as static library")
 add_subdirectory("src/llama.cpp")

@@ -71,6 +77,12 @@ file(
   "src/LlamaCompletionWorker.h"
   "src/LlamaContext.cpp"
   "src/LlamaContext.h"
+  "src/TokenizeWorker.cpp"
+  "src/TokenizeWorker.h"
+  "src/DetokenizeWorker.cpp"
+  "src/DetokenizeWorker.h"
+  "src/EmbeddingWorker.cpp"
+  "src/EmbeddingWorker.h"
   "src/LoadSessionWorker.cpp"
   "src/LoadSessionWorker.h"
   "src/SaveSessionWorker.cpp"
@@ -81,31 +93,54 @@ add_library(${PROJECT_NAME} SHARED ${SOURCE_FILES} ${CMAKE_JS_SRC})
 set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "" SUFFIX ".node")
 target_link_libraries(${PROJECT_NAME} ${CMAKE_JS_LIB} llama ggml common)

+add_custom_target(copy_assets ALL DEPENDS ${PROJECT_NAME})
+
+add_custom_command(
+  TARGET copy_assets
+  COMMAND ${CMAKE_COMMAND} -E remove_directory ${PLATFORM_BINARY_DIR}
+  COMMENT "Cleaning bin folder"
+)
+
 if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
   # Generate node.lib
   execute_process(COMMAND ${CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS})
   # copy target to bin folder
   get_filename_component(CMAKE_JS_NODELIB_TARGET_NAME ${CMAKE_JS_NODELIB_TARGET} NAME)
-  add_custom_command(TARGET
+  add_custom_command(TARGET copy_assets
     COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_JS_NODELIB_TARGET} ${PLATFORM_BINARY_DIR}/${CMAKE_JS_NODELIB_TARGET_NAME}
     COMMENT "Copying to bin folder"
   )
 endif()

 # copy target to bin folder
-add_custom_command(TARGET
+add_custom_command(TARGET copy_assets
   COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${PROJECT_NAME}> ${PLATFORM_BINARY_DIR}/$<TARGET_FILE_NAME:${PROJECT_NAME}>
   COMMENT "Copying to bin folder"
 )

 if (LLAMA_METAL)
   # copy ${CMAKE_BINARY_DIR}/bin/default.metallib
-  add_custom_target(copy_metallib)
   add_custom_command(
-    TARGET
+    TARGET copy_assets
     COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/bin/default.metallib ${PLATFORM_BINARY_DIR}/default.metallib
     COMMENT "Copying default.metallib to bin folder"
   )
-  add_dependencies(
-
+  add_dependencies(copy_assets ggml-metal)
+endif()
+
+if (LLAMA_CLBLAST)
+  find_package(CLBlast)
+  if (CLBlast_FOUND)
+    message(STATUS "CLBlast found: ${CLBlast_DIR}")
+    file(
+      GLOB CLBlast_SO_FILES
+      ${CLBlast_DIR}/../../../bin/clblast.dll
+      ${CLBlast_DIR}/../../../lib/libclblast.so
+    )
+    add_custom_command(
+      TARGET copy_assets
+      COMMAND ${CMAKE_COMMAND} -E copy ${CLBlast_SO_FILES} ${PLATFORM_BINARY_DIR}
+      COMMENT "Copying CLBlast SO files to bin folder"
+    )
+  endif()
 endif()
package/README.md
CHANGED
@@ -1,5 +1,9 @@
 # llama.node

+[](https://github.com/mybigday/llama.node/actions/workflows/ci.yml)
+[](https://www.npmjs.com/package/@fugood/llama.node)
+
+
 Node binding of [llama.cpp](https://github.com/ggerganov/llama.cpp).

 [llama.cpp](https://github.com/ggerganov/llama.cpp): Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
@@ -22,6 +26,7 @@ const context = await loadModel({
   n_ctx: 2048,
   n_gpu_layers: 1, // > 0: enable GPU
   // embedding: true, // use embedding
+  // lib_variant: 'opencl', // Change backend
 })

 // Do completion
@@ -40,6 +45,11 @@ const { text, timings } = await context.completion(
 console.log('Result:', text)
 ```

+## Lib Variants
+
+- [x] `default`: General usage, not support GPU except macOS (Metal)
+- [x] `vulkan`: Support GPU Vulkan (Windows/Linux), but some scenario might unstable
+
 ## License

 MIT
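Taken together with the new `lib_variant` option and the variant list above, loading a non-default backend from JavaScript looks roughly like the sketch below. This only recombines the README snippets shown in the diff; the model path and prompt are placeholders, and `'vulkan'` assumes the matching prebuilt binary under `bin/` exists for your platform.

```ts
import { loadModel } from '@fugood/llama.node'

// Illustrative only: model path and prompt are placeholders.
const context = await loadModel({
  model: 'path/to/model.gguf',
  n_ctx: 2048,
  n_gpu_layers: 1,       // > 0: enable GPU
  lib_variant: 'vulkan', // loads bin/<platform>-vulkan/<arch>/llama-node.node
})

const { text, timings } = await context.completion(
  { prompt: 'Hello,', max_tokens: 32 },
  (data) => process.stdout.write(data.token), // streamed tokens
)
console.log('Result:', text, timings)
```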
Binary files changed (contents not shown):
- package/bin/darwin/arm64/default.metallib
- package/bin/darwin/arm64/llama-node.node
- package/bin/darwin/x64/default.metallib
- package/bin/darwin/x64/llama-node.node
- package/bin/linux/arm64/llama-node.node
- package/bin/linux/x64/llama-node.node
- package/bin/linux-vulkan/arm64/llama-node.node
- package/bin/linux-vulkan/x64/llama-node.node
package/lib/binding.js
CHANGED
@@ -50,7 +50,7 @@ const setupEnv = (variant) => {
 };
 const loadModule = (variant) => __awaiter(void 0, void 0, void 0, function* () {
     try {
-        if (variant) {
+        if (variant && variant !== 'default') {
            setupEnv(variant);
            return yield Promise.resolve(`${`../bin/${process.platform}-${variant}/${process.arch}/llama-node.node`}`).then(s => __importStar(require(s)));
        }
package/lib/binding.ts
CHANGED
@@ -23,6 +23,7 @@ export type LlamaCompletionOptions = {
   max_tokens?: number
   seed?: number
   stop?: string[]
+  grammar?: string
 }

 export type LlamaCompletionResult = {
@@ -36,11 +37,22 @@ export type LlamaCompletionToken = {
   token: string
 }

+export type TokenizeResult = {
+  tokens: Int32Array
+}
+
+export type EmbeddingResult = {
+  embedding: Float32Array
+}
+
 export interface LlamaContext {
   new (options: LlamaModelOptions): LlamaContext
   getSystemInfo(): string
   completion(options: LlamaCompletionOptions, callback?: (token: LlamaCompletionToken) => void): Promise<LlamaCompletionResult>
   stopCompletion(): void
+  tokenize(text: string): Promise<TokenizeResult>
+  detokenize(tokens: number[]): Promise<string>
+  embedding(text: string): Promise<EmbeddingResult>
   saveSession(path: string): Promise<void>
   loadSession(path: string): Promise<void>
   release(): Promise<void>
@@ -50,6 +62,8 @@ export interface Module {
   LlamaContext: LlamaContext
 }

+export type LibVariant = 'default' | 'opencl'
+
 const setupEnv = (variant?: string) => {
   const postfix = variant ? `-${variant}` : ''
   const binPath = path.resolve(__dirname, `../bin/${process.platform}${postfix}/${process.arch}/`)
@@ -63,9 +77,9 @@ const setupEnv = (variant?: string) => {
   }
 }

-export const loadModule = async (variant?:
+export const loadModule = async (variant?: LibVariant): Promise<Module> => {
   try {
-    if (variant) {
+    if (variant && variant !== 'default') {
       setupEnv(variant)
       return await import(`../bin/${process.platform}-${variant}/${process.arch}/llama-node.node`) as Module
     }
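The new typings above give `LlamaContext` three extra async methods. A minimal round-trip sketch, assuming a `context` obtained from `loadModel` (and, for the last call, a model loaded with `embedding: true`):

```ts
import type { LlamaContext } from '@fugood/llama.node'

const roundTrip = async (context: LlamaContext, text: string) => {
  // tokenize resolves to { tokens: Int32Array }
  const { tokens } = await context.tokenize(text)
  // detokenize takes a plain number[] and resolves to a string
  const restored = await context.detokenize(Array.from(tokens))
  // embedding resolves to { embedding: Float32Array }
  const { embedding } = await context.embedding(text)
  return { count: tokens.length, restored, dims: embedding.length }
}
```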
package/lib/index.ts
CHANGED
@@ -1,10 +1,10 @@
 import { loadModule, LlamaModelOptions } from './binding'
-import type { Module, LlamaContext } from './binding'
+import type { Module, LlamaContext, LibVariant } from './binding'

 export * from './binding'

 export interface LlamaModelOptionsExtended extends LlamaModelOptions {
-  lib_variant?:
+  lib_variant?: LibVariant
 }

 const mods: { [key: string]: Module } = {}
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "0.0.1-alpha.4",
+  "version": "0.2.0",
   "description": "Llama.cpp for Node.js",
   "main": "lib/index.js",
   "scripts": {
@@ -39,8 +39,6 @@
   },
   "files": [
     "bin/**/*",
-    "scripts/*.js",
-    "scripts/*.ts",
     "src/**/*.{c,cc,cpp,h,hh,hpp,txt,cmake}",
     "lib/*.js",
     "lib/*.ts",
@@ -62,5 +60,19 @@
   },
   "dependencies": {
     "node-addon-api": "^8.0.0"
+  },
+  "jest": {
+    "testEnvironment": "node",
+    "moduleFileExtensions": [
+      "ts",
+      "tsx",
+      "js",
+      "jsx",
+      "json",
+      "node"
+    ],
+    "testMatch": [
+      "**/*.test.ts"
+    ]
   }
 }
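The added `jest` block wires up an in-repo test runner: any `*.test.ts` file is picked up via `testMatch` (a TypeScript transform such as `ts-jest` is implied but not shown in this diff). A hypothetical smoke test for the new context methods could look like:

```ts
// tokenize.test.ts — hypothetical; the model path is a placeholder.
import { loadModel } from './lib'

it('round-trips text through tokenize/detokenize', async () => {
  const context = await loadModel({ model: 'path/to/tiny-model.gguf', n_ctx: 512 })
  const { tokens } = await context.tokenize('Once upon a time')
  expect(tokens.length).toBeGreaterThan(0)
  expect(typeof (await context.detokenize(Array.from(tokens)))).toBe('string')
  await context.release()
})
```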
package/src/DetokenizeWorker.cpp
ADDED
@@ -0,0 +1,22 @@
+#include "DetokenizeWorker.h"
+#include "LlamaContext.h"
+
+DetokenizeWorker::DetokenizeWorker(const Napi::CallbackInfo &info,
+                                   LlamaSessionPtr &sess,
+                                   std::vector<llama_token> &tokens)
+    : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
+      _tokens(std::move(tokens)) {}
+
+void DetokenizeWorker::Execute() {
+  const auto text = ::llama_detokenize_bpe(_sess->context(), _tokens);
+  _text = std::move(text);
+}
+
+void DetokenizeWorker::OnOK() {
+  Napi::Promise::Deferred::Resolve(
+      Napi::String::New(Napi::AsyncWorker::Env(), _text));
+}
+
+void DetokenizeWorker::OnError(const Napi::Error &err) {
+  Napi::Promise::Deferred::Reject(err.Value());
+}
package/src/DetokenizeWorker.h
ADDED
@@ -0,0 +1,19 @@
+#include "common.hpp"
+#include <vector>
+
+class DetokenizeWorker : public Napi::AsyncWorker,
+                         public Napi::Promise::Deferred {
+public:
+  DetokenizeWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
+                   std::vector<llama_token> &tokens);
+
+protected:
+  void Execute();
+  void OnOK();
+  void OnError(const Napi::Error &err);
+
+private:
+  LlamaSessionPtr _sess;
+  std::vector<llama_token> _tokens;
+  std::string _text;
+};
package/src/EmbeddingWorker.cpp
ADDED
@@ -0,0 +1,46 @@
+#include "EmbeddingWorker.h"
+#include "LlamaContext.h"
+
+EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info,
+                                 LlamaSessionPtr &sess, std::string text)
+    : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text) {}
+
+void EmbeddingWorker::Execute() {
+  llama_kv_cache_clear(_sess->context());
+  auto tokens = ::llama_tokenize(_sess->context(), _text, true);
+  // add SEP if not present
+  if (tokens.empty() || tokens.back() != llama_token_sep(_sess->model())) {
+    tokens.push_back(llama_token_sep(_sess->model()));
+  }
+  const int n_embd = llama_n_embd(_sess->model());
+  do {
+    int ret =
+        llama_decode(_sess->context(),
+                     llama_batch_get_one(tokens.data(), tokens.size(), 0, 0));
+    if (ret < 0) {
+      SetError("Failed to inference, code: " + std::to_string(ret));
+      break;
+    }
+    const float *embd = llama_get_embeddings_seq(_sess->context(), 0);
+    if (embd == nullptr) {
+      SetError("Failed to get embeddings");
+      break;
+    }
+    _result.embedding.resize(n_embd);
+    memcpy(_result.embedding.data(), embd, n_embd * sizeof(float));
+  } while (false);
+}
+
+void EmbeddingWorker::OnOK() {
+  auto result = Napi::Object::New(Napi::AsyncWorker::Env());
+  auto embedding = Napi::Float32Array::New(Napi::AsyncWorker::Env(),
+                                           _result.embedding.size());
+  memcpy(embedding.Data(), _result.embedding.data(),
+         _result.embedding.size() * sizeof(float));
+  result.Set("embedding", embedding);
+  Napi::Promise::Deferred::Resolve(result);
+}
+
+void EmbeddingWorker::OnError(const Napi::Error &err) {
+  Napi::Promise::Deferred::Reject(err.Value());
+}
package/src/EmbeddingWorker.h
ADDED
@@ -0,0 +1,23 @@
+#include "common.hpp"
+#include <vector>
+
+struct EmbeddingResult {
+  std::vector<float> embedding;
+};
+
+class EmbeddingWorker : public Napi::AsyncWorker,
+                        public Napi::Promise::Deferred {
+public:
+  EmbeddingWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
+                  std::string text);
+
+protected:
+  void Execute();
+  void OnOK();
+  void OnError(const Napi::Error &err);
+
+private:
+  LlamaSessionPtr _sess;
+  std::string _text;
+  EmbeddingResult _result;
+};
package/src/LlamaCompletionWorker.cpp
CHANGED
@@ -58,7 +58,7 @@ void LlamaCompletionWorker::Execute() {
   const auto n_keep = _params.n_keep;
   size_t n_cur = 0;
   size_t n_input = 0;
-  const auto model =
+  const auto model = _sess->model();
   const bool add_bos = llama_should_add_bos_token(model);
   auto ctx = _sess->context();

@@ -110,6 +110,7 @@ void LlamaCompletionWorker::Execute() {
     // sample the next token
     const llama_token new_token_id =
         llama_sampling_sample(sampling.get(), ctx, nullptr);
+    llama_sampling_accept(sampling.get(), ctx, new_token_id, true);
     // prepare the next batch
     embd->emplace_back(new_token_id);
     auto token = llama_token_to_piece(ctx, new_token_id);
@@ -143,6 +144,9 @@ void LlamaCompletionWorker::Execute() {
   }
   const auto t_main_end = ggml_time_us();
   _sess->get_mutex().unlock();
+  if (_onComplete) {
+    _onComplete();
+  }
 }

 void LlamaCompletionWorker::OnOK() {
package/src/LlamaCompletionWorker.h
CHANGED
@@ -1,4 +1,5 @@
 #include "common.hpp"
+#include <functional>

 struct CompletionResult {
   std::string text = "";
@@ -18,6 +19,8 @@ public:

   inline void Stop() { _stop = true; }

+  inline void onComplete(std::function<void()> cb) { _onComplete = cb; }
+
 protected:
   void Execute();
   void OnOK();
@@ -30,5 +33,6 @@ private:
   Napi::ThreadSafeFunction _tsfn;
   bool _has_callback = false;
   bool _stop = false;
+  std::function<void()> _onComplete;
   CompletionResult _result;
 };
package/src/LlamaContext.cpp
CHANGED
@@ -1,8 +1,11 @@
 #include "LlamaContext.h"
+#include "DetokenizeWorker.h"
 #include "DisposeWorker.h"
+#include "EmbeddingWorker.h"
 #include "LlamaCompletionWorker.h"
 #include "LoadSessionWorker.h"
 #include "SaveSessionWorker.h"
+#include "TokenizeWorker.h"

 void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
   Napi::Function func = DefineClass(
@@ -16,6 +19,13 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
       InstanceMethod<&LlamaContext::StopCompletion>(
           "stopCompletion",
           static_cast<napi_property_attributes>(napi_enumerable)),
+      InstanceMethod<&LlamaContext::Tokenize>(
+          "tokenize", static_cast<napi_property_attributes>(napi_enumerable)),
+      InstanceMethod<&LlamaContext::Detokenize>(
+          "detokenize",
+          static_cast<napi_property_attributes>(napi_enumerable)),
+      InstanceMethod<&LlamaContext::Embedding>(
+          "embedding", static_cast<napi_property_attributes>(napi_enumerable)),
       InstanceMethod<&LlamaContext::SaveSession>(
           "saveSession",
           static_cast<napi_property_attributes>(napi_enumerable)),
@@ -70,7 +80,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
         .ThrowAsJavaScriptException();
   }

-  _sess = std::make_shared<LlamaSession>(ctx, params);
+  _sess = std::make_shared<LlamaSession>(model, ctx, params);
   _info = get_system_info(params);
 }

@@ -93,6 +103,10 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
     Napi::TypeError::New(env, "Context is disposed")
        .ThrowAsJavaScriptException();
   }
+  if (_wip != nullptr) {
+    Napi::TypeError::New(env, "Another completion is in progress")
+        .ThrowAsJavaScriptException();
+  }
   auto options = info[0].As<Napi::Object>();

   gpt_params params = _sess->params();
@@ -143,6 +157,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
       new LlamaCompletionWorker(info, _sess, callback, params, stop_words);
   worker->Queue();
   _wip = worker;
+  worker->onComplete([this]() { _wip = nullptr; });
   return worker->Promise();
 }

@@ -153,6 +168,58 @@ void LlamaContext::StopCompletion(const Napi::CallbackInfo &info) {
   }
 }

+// tokenize(text: string): Promise<TokenizeResult>
+Napi::Value LlamaContext::Tokenize(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() < 1 || !info[0].IsString()) {
+    Napi::TypeError::New(env, "String expected").ThrowAsJavaScriptException();
+  }
+  if (_sess == nullptr) {
+    Napi::TypeError::New(env, "Context is disposed")
+        .ThrowAsJavaScriptException();
+  }
+  auto text = info[0].ToString().Utf8Value();
+  auto *worker = new TokenizeWorker(info, _sess, text);
+  worker->Queue();
+  return worker->Promise();
+}
+
+// detokenize(tokens: number[]): Promise<string>
+Napi::Value LlamaContext::Detokenize(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() < 1 || !info[0].IsArray()) {
+    Napi::TypeError::New(env, "Array expected").ThrowAsJavaScriptException();
+  }
+  if (_sess == nullptr) {
+    Napi::TypeError::New(env, "Context is disposed")
+        .ThrowAsJavaScriptException();
+  }
+  auto tokens = info[0].As<Napi::Array>();
+  std::vector<int32_t> token_ids;
+  for (size_t i = 0; i < tokens.Length(); i++) {
+    token_ids.push_back(tokens.Get(i).ToNumber().Int32Value());
+  }
+  auto *worker = new DetokenizeWorker(info, _sess, token_ids);
+  worker->Queue();
+  return worker->Promise();
+}
+
+// embedding(text: string): Promise<EmbeddingResult>
+Napi::Value LlamaContext::Embedding(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() < 1 || !info[0].IsString()) {
+    Napi::TypeError::New(env, "String expected").ThrowAsJavaScriptException();
+  }
+  if (_sess == nullptr) {
+    Napi::TypeError::New(env, "Context is disposed")
+        .ThrowAsJavaScriptException();
+  }
+  auto text = info[0].ToString().Utf8Value();
+  auto *worker = new EmbeddingWorker(info, _sess, text);
+  worker->Queue();
+  return worker->Promise();
+}
+
 // saveSession(path: string): Promise<void> throws error
 Napi::Value LlamaContext::SaveSession(const Napi::CallbackInfo &info) {
   Napi::Env env = info.Env();
@@ -163,6 +230,12 @@ Napi::Value LlamaContext::SaveSession(const Napi::CallbackInfo &info) {
     Napi::TypeError::New(env, "Context is disposed")
        .ThrowAsJavaScriptException();
   }
+#ifdef GGML_USE_VULKAN
+  if (_sess->params().n_gpu_layers > 0) {
+    Napi::TypeError::New(env, "Vulkan cannot save session")
+        .ThrowAsJavaScriptException();
+  }
+#endif
   auto *worker = new SaveSessionWorker(info, _sess);
   worker->Queue();
   return worker->Promise();
@@ -178,6 +251,12 @@ Napi::Value LlamaContext::LoadSession(const Napi::CallbackInfo &info) {
     Napi::TypeError::New(env, "Context is disposed")
        .ThrowAsJavaScriptException();
   }
+#ifdef GGML_USE_VULKAN
+  if (_sess->params().n_gpu_layers > 0) {
+    Napi::TypeError::New(env, "Vulkan cannot load session")
+        .ThrowAsJavaScriptException();
+  }
+#endif
   auto *worker = new LoadSessionWorker(info, _sess);
   worker->Queue();
   return worker->Promise();
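Two of these native changes are directly observable from JavaScript: a second `completion()` call while one is in flight now throws "Another completion is in progress", and on the Vulkan build `saveSession`/`loadSession` refuse to run when GPU layers are enabled. A defensive-usage sketch (the `prompt` option and paths are illustrative; `context` is assumed to come from `loadModel`):

```ts
import type { LlamaContext } from '@fugood/llama.node'

const demoGuards = async (context: LlamaContext) => {
  const first = context.completion({ prompt: 'Hello', max_tokens: 16 })
  try {
    // Concurrent call now fails fast instead of racing the first one.
    await context.completion({ prompt: 'World', max_tokens: 16 })
  } catch (err) {
    console.warn(String(err)) // "Another completion is in progress"
  }
  await first

  try {
    await context.saveSession('session.bin')
  } catch (err) {
    // On the vulkan variant with n_gpu_layers > 0: "Vulkan cannot save session"
    console.warn(String(err))
  }
}
```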
package/src/LlamaContext.h
CHANGED
@@ -11,6 +11,9 @@ private:
   Napi::Value GetSystemInfo(const Napi::CallbackInfo &info);
   Napi::Value Completion(const Napi::CallbackInfo &info);
   void StopCompletion(const Napi::CallbackInfo &info);
+  Napi::Value Tokenize(const Napi::CallbackInfo &info);
+  Napi::Value Detokenize(const Napi::CallbackInfo &info);
+  Napi::Value Embedding(const Napi::CallbackInfo &info);
   Napi::Value SaveSession(const Napi::CallbackInfo &info);
   Napi::Value LoadSession(const Napi::CallbackInfo &info);
   Napi::Value Release(const Napi::CallbackInfo &info);
package/src/TokenizeWorker.cpp
ADDED
@@ -0,0 +1,26 @@
+#include "TokenizeWorker.h"
+#include "LlamaContext.h"
+
+TokenizeWorker::TokenizeWorker(const Napi::CallbackInfo &info,
+                               LlamaSessionPtr &sess, std::string text)
+    : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text) {}
+
+void TokenizeWorker::Execute() {
+  const auto tokens = ::llama_tokenize(_sess->context(), _text, false);
+  _result = {.tokens = std::move(tokens)};
+}
+
+void TokenizeWorker::OnOK() {
+  Napi::HandleScope scope(Napi::AsyncWorker::Env());
+  auto result = Napi::Object::New(Napi::AsyncWorker::Env());
+  auto tokens =
+      Napi::Int32Array::New(Napi::AsyncWorker::Env(), _result.tokens.size());
+  memcpy(tokens.Data(), _result.tokens.data(),
+         _result.tokens.size() * sizeof(llama_token));
+  result.Set("tokens", tokens);
+  Napi::Promise::Deferred::Resolve(result);
+}
+
+void TokenizeWorker::OnError(const Napi::Error &err) {
+  Napi::Promise::Deferred::Reject(err.Value());
+}
package/src/TokenizeWorker.h
ADDED
@@ -0,0 +1,23 @@
+#include "common.hpp"
+#include <vector>
+
+struct TokenizeResult {
+  std::vector<llama_token> tokens;
+};
+
+class TokenizeWorker : public Napi::AsyncWorker,
+                       public Napi::Promise::Deferred {
+public:
+  TokenizeWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
+                 std::string text);
+
+protected:
+  void Execute();
+  void OnOK();
+  void OnError(const Napi::Error &err);
+
+private:
+  LlamaSessionPtr _sess;
+  std::string _text;
+  TokenizeResult _result;
+};
package/src/common.hpp
CHANGED
@@ -46,32 +46,37 @@ constexpr T get_option(const Napi::Object &options, const std::string &name,

 class LlamaSession {
 public:
-  LlamaSession(llama_context *ctx, gpt_params params)
-      :
+  LlamaSession(llama_model *model, llama_context *ctx, gpt_params params)
+      : model_(LlamaCppModel(model, llama_free_model)),
+        ctx_(LlamaCppContext(ctx, llama_free)), params_(params) {
     tokens_.reserve(params.n_ctx);
   }

   ~LlamaSession() { dispose(); }

-  llama_context *context() { return ctx_.get(); }
+  inline llama_context *context() { return ctx_.get(); }

-
+  inline llama_model *model() { return model_.get(); }

-
+  inline std::vector<llama_token> *tokens_ptr() { return &tokens_; }
+
+  inline void set_tokens(std::vector<llama_token> tokens) {
     tokens_ = std::move(tokens);
   }

-  const gpt_params &params() const { return params_; }
+  inline const gpt_params &params() const { return params_; }

-  std::mutex &get_mutex() { return mutex; }
+  inline std::mutex &get_mutex() { return mutex; }

   void dispose() {
     std::lock_guard<std::mutex> lock(mutex);
     tokens_.clear();
     ctx_.reset();
+    model_.reset();
   }

 private:
+  LlamaCppModel model_;
   LlamaCppContext ctx_;
   const gpt_params params_;
   std::vector<llama_token> tokens_{};
|