@fugood/llama.node 0.0.1-alpha.4 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +36 -7
- package/README.md +9 -0
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/lib/binding.js +1 -1
- package/lib/binding.ts +5 -2
- package/lib/index.ts +2 -2
- package/package.json +15 -3
- package/src/LlamaCompletionWorker.cpp +5 -1
- package/src/LlamaCompletionWorker.h +4 -0
- package/src/LlamaContext.cpp +18 -1
- package/src/common.hpp +11 -7
- package/src/llama.cpp/CMakeLists.txt +13 -7
- package/src/llama.cpp/common/common.cpp +221 -173
- package/src/llama.cpp/common/common.h +19 -8
- package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
- package/src/llama.cpp/common/log.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +17 -1
- package/src/llama.cpp/common/sampling.h +28 -20
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
- package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
- package/src/llama.cpp/examples/llava/clip.cpp +74 -23
- package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
- package/src/llama.cpp/examples/main/main.cpp +10 -8
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/server/server.cpp +97 -86
- package/src/llama.cpp/examples/server/utils.hpp +17 -15
- package/src/llama.cpp/ggml-backend.c +7 -5
- package/src/llama.cpp/ggml-impl.h +339 -4
- package/src/llama.cpp/ggml-kompute.cpp +7 -0
- package/src/llama.cpp/ggml-opencl.cpp +1 -0
- package/src/llama.cpp/ggml-quants.c +302 -293
- package/src/llama.cpp/ggml-sycl.cpp +28 -16
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- package/src/llama.cpp/ggml-vulkan.cpp +951 -263
- package/src/llama.cpp/ggml.c +1469 -116
- package/src/llama.cpp/ggml.h +37 -7
- package/src/llama.cpp/llama.cpp +969 -432
- package/src/llama.cpp/llama.h +46 -14
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
- package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/sgemm.cpp +134 -103
- package/src/llama.cpp/sgemm.h +4 -2
- package/src/llama.cpp/tests/CMakeLists.txt +96 -36
- package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
- package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
- package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
- package/src/llama.cpp/unicode-data.cpp +1188 -656
- package/src/llama.cpp/unicode-data.h +4 -3
- package/src/llama.cpp/unicode.cpp +590 -49
- package/src/llama.cpp/unicode.h +6 -3
- package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
- package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190
package/CMakeLists.txt
CHANGED
@@ -26,7 +26,7 @@ string(REPLACE "i686" "ia32" ARCH ${ARCH})
 string(REPLACE "i386" "ia32" ARCH ${ARCH})
 string(REPLACE "armv7l" "arm" ARCH ${ARCH})
 string(REPLACE "arm" "arm" ARCH ${ARCH})
-string(REPLACE "
+string(REPLACE "arm64x" "arm64" ARCH ${ARCH})
 string(REPLACE "aarch64" "arm64" ARCH ${ARCH})

 if(DEFINED VARIANT)
@@ -58,6 +58,12 @@ include_directories(${CMAKE_JS_INC})
 # flags: -fPIC
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)

+# VULKAN_SDK
+if (VULKAN_SDK)
+  set(ENV{VULKAN_SDK} ${VULKAN_SDK})
+  find_package(Vulkan REQUIRED)
+endif()
+
 set(LLAMA_STATIC ON CACHE BOOL "Build llama as static library")
 add_subdirectory("src/llama.cpp")

@@ -81,31 +87,54 @@ add_library(${PROJECT_NAME} SHARED ${SOURCE_FILES} ${CMAKE_JS_SRC})
 set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "" SUFFIX ".node")
 target_link_libraries(${PROJECT_NAME} ${CMAKE_JS_LIB} llama ggml common)

+add_custom_target(copy_assets ALL DEPENDS ${PROJECT_NAME})
+
+add_custom_command(
+  TARGET copy_assets
+  COMMAND ${CMAKE_COMMAND} -E remove_directory ${PLATFORM_BINARY_DIR}
+  COMMENT "Cleaning bin folder"
+)
+
 if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
   # Generate node.lib
   execute_process(COMMAND ${CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS})
   # copy target to bin folder
   get_filename_component(CMAKE_JS_NODELIB_TARGET_NAME ${CMAKE_JS_NODELIB_TARGET} NAME)
-  add_custom_command(TARGET
+  add_custom_command(TARGET copy_assets
     COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_JS_NODELIB_TARGET} ${PLATFORM_BINARY_DIR}/${CMAKE_JS_NODELIB_TARGET_NAME}
     COMMENT "Copying to bin folder"
   )
 endif()

 # copy target to bin folder
-add_custom_command(TARGET
+add_custom_command(TARGET copy_assets
   COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${PROJECT_NAME}> ${PLATFORM_BINARY_DIR}/$<TARGET_FILE_NAME:${PROJECT_NAME}>
   COMMENT "Copying to bin folder"
 )

 if (LLAMA_METAL)
   # copy ${CMAKE_BINARY_DIR}/bin/default.metallib
-  add_custom_target(copy_metallib)
   add_custom_command(
-    TARGET
+    TARGET copy_assets
     COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/bin/default.metallib ${PLATFORM_BINARY_DIR}/default.metallib
     COMMENT "Copying default.metallib to bin folder"
   )
-  add_dependencies(
-
+  add_dependencies(copy_assets ggml-metal)
+endif()
+
+if (LLAMA_CLBLAST)
+  find_package(CLBlast)
+  if (CLBlast_FOUND)
+    message(STATUS "CLBlast found: ${CLBlast_DIR}")
+    file(
+      GLOB CLBlast_SO_FILES
+      ${CLBlast_DIR}/../../../bin/clblast.dll
+      ${CLBlast_DIR}/../../../lib/libclblast.so
+    )
+    add_custom_command(
+      TARGET copy_assets
+      COMMAND ${CMAKE_COMMAND} -E copy ${CLBlast_SO_FILES} ${PLATFORM_BINARY_DIR}
+      COMMENT "Copying CLBlast SO files to bin folder"
+    )
+  endif()
 endif()
package/README.md
CHANGED
@@ -1,5 +1,9 @@
 # llama.node

+[](https://github.com/mybigday/llama.node/actions/workflows/ci.yml)
+[](https://www.npmjs.com/package/@fugood/llama.node)
+
+
 Node binding of [llama.cpp](https://github.com/ggerganov/llama.cpp).

 [llama.cpp](https://github.com/ggerganov/llama.cpp): Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
@@ -22,6 +26,7 @@ const context = await loadModel({
   n_ctx: 2048,
   n_gpu_layers: 1, // > 0: enable GPU
   // embedding: true, // use embedding
+  // lib_variant: 'opencl', // Change backend
 })

 // Do completion
@@ -40,6 +45,10 @@ const { text, timings } = await context.completion(
 console.log('Result:', text)
 ```

+## Lib Variants
+
+- [x] `default`: General usage, Supported GPU: Metal (macOS) and Vulkan (Linux / Windows)
+
 ## License

 MIT
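As documented above, the new `lib_variant` option selects which prebuilt backend binary is loaded. A minimal sketch of passing it to `loadModel`; the import path and the `model` field are assumptions based on the README example, and the model path is a placeholder:

```ts
import { loadModel } from '@fugood/llama.node'

const context = await loadModel({
  model: './model.gguf', // placeholder path
  n_ctx: 2048,
  n_gpu_layers: 1,
  lib_variant: 'opencl', // omit, or use 'default', for the Metal/Vulkan build
})
```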
package/bin/** (prebuilt binaries)
CHANGED

Binary files updated for all platforms (contents not shown): default.metallib and llama-node.node for darwin/arm64 and darwin/x64; llama-node.node for linux/arm64 and linux/x64; llama-node.node and node.lib for win32/arm64 and win32/x64.
package/lib/binding.js
CHANGED
@@ -50,7 +50,7 @@ const setupEnv = (variant) => {
 };
 const loadModule = (variant) => __awaiter(void 0, void 0, void 0, function* () {
     try {
-        if (variant) {
+        if (variant && variant !== 'default') {
             setupEnv(variant);
             return yield Promise.resolve(`${`../bin/${process.platform}-${variant}/${process.arch}/llama-node.node`}`).then(s => __importStar(require(s)));
         }
package/lib/binding.ts
CHANGED
@@ -23,6 +23,7 @@ export type LlamaCompletionOptions = {
   max_tokens?: number
   seed?: number
   stop?: string[]
+  grammar?: string
 }

 export type LlamaCompletionResult = {
@@ -50,6 +51,8 @@ export interface Module {
   LlamaContext: LlamaContext
 }

+export type LibVariant = 'default' | 'opencl'
+
 const setupEnv = (variant?: string) => {
   const postfix = variant ? `-${variant}` : ''
   const binPath = path.resolve(__dirname, `../bin/${process.platform}${postfix}/${process.arch}/`)
@@ -63,9 +66,9 @@ const setupEnv = (variant?: string) => {
   }
 }

-export const loadModule = async (variant?:
+export const loadModule = async (variant?: LibVariant): Promise<Module> => {
   try {
-    if (variant) {
+    if (variant && variant !== 'default') {
       setupEnv(variant)
       return await import(`../bin/${process.platform}-${variant}/${process.arch}/llama-node.node`) as Module
     }
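For reference, the lookup rule these changes implement can be summarized as a small standalone sketch (a hypothetical helper, not part of the package's exports):

```ts
// Sketch of the binary-resolution rule in setupEnv/loadModule above:
// a non-default variant adds a "-<variant>" suffix to the platform folder,
// while 'default' (or no variant) loads the plain prebuilt binary.
type LibVariant = 'default' | 'opencl'

const resolveBinaryPath = (variant?: LibVariant): string => {
  const postfix = variant && variant !== 'default' ? `-${variant}` : ''
  return `bin/${process.platform}${postfix}/${process.arch}/llama-node.node`
}

// e.g. on linux/x64:
//   resolveBinaryPath()         // 'bin/linux/x64/llama-node.node'
//   resolveBinaryPath('opencl') // 'bin/linux-opencl/x64/llama-node.node'
```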
package/lib/index.ts
CHANGED
@@ -1,10 +1,10 @@
 import { loadModule, LlamaModelOptions } from './binding'
-import type { Module, LlamaContext } from './binding'
+import type { Module, LlamaContext, LibVariant } from './binding'

 export * from './binding'

 export interface LlamaModelOptionsExtended extends LlamaModelOptions {
-  lib_variant?:
+  lib_variant?: LibVariant
 }

 const mods: { [key: string]: Module } = {}
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "0.
+  "version": "0.1.0",
   "description": "Llama.cpp for Node.js",
   "main": "lib/index.js",
   "scripts": {
@@ -39,8 +39,6 @@
   },
   "files": [
     "bin/**/*",
-    "scripts/*.js",
-    "scripts/*.ts",
     "src/**/*.{c,cc,cpp,h,hh,hpp,txt,cmake}",
     "lib/*.js",
     "lib/*.ts",
@@ -62,5 +60,19 @@
   },
   "dependencies": {
     "node-addon-api": "^8.0.0"
+  },
+  "jest": {
+    "testEnvironment": "node",
+    "moduleFileExtensions": [
+      "ts",
+      "tsx",
+      "js",
+      "jsx",
+      "json",
+      "node"
+    ],
+    "testMatch": [
+      "**/*.test.ts"
+    ]
   }
 }
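The jest block added above wires the package for TypeScript tests matched by `**/*.test.ts`. A hypothetical minimal test file of that shape (not taken from the package's actual test suite; the `loadModel` export is assumed from the README):

```ts
// lib/index.test.ts (hypothetical): picked up by the "**/*.test.ts" pattern
import * as llama from './index'

describe('@fugood/llama.node', () => {
  it('exposes a loadModel function', () => {
    expect(typeof llama.loadModel).toBe('function')
  })
})
```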
package/src/LlamaCompletionWorker.cpp
CHANGED

@@ -58,7 +58,7 @@ void LlamaCompletionWorker::Execute() {
   const auto n_keep = _params.n_keep;
   size_t n_cur = 0;
   size_t n_input = 0;
-  const auto model =
+  const auto model = _sess->model();
   const bool add_bos = llama_should_add_bos_token(model);
   auto ctx = _sess->context();

@@ -110,6 +110,7 @@ void LlamaCompletionWorker::Execute() {
     // sample the next token
     const llama_token new_token_id =
         llama_sampling_sample(sampling.get(), ctx, nullptr);
+    llama_sampling_accept(sampling.get(), ctx, new_token_id, true);
     // prepare the next batch
     embd->emplace_back(new_token_id);
     auto token = llama_token_to_piece(ctx, new_token_id);
@@ -143,6 +144,9 @@ void LlamaCompletionWorker::Execute() {
   }
   const auto t_main_end = ggml_time_us();
   _sess->get_mutex().unlock();
+  if (_onComplete) {
+    _onComplete();
+  }
 }

 void LlamaCompletionWorker::OnOK() {
package/src/LlamaCompletionWorker.h
CHANGED

@@ -1,4 +1,5 @@
 #include "common.hpp"
+#include <functional>

 struct CompletionResult {
   std::string text = "";
@@ -18,6 +19,8 @@ public:

   inline void Stop() { _stop = true; }

+  inline void onComplete(std::function<void()> cb) { _onComplete = cb; }
+
 protected:
   void Execute();
   void OnOK();
@@ -30,5 +33,6 @@ private:
   Napi::ThreadSafeFunction _tsfn;
   bool _has_callback = false;
   bool _stop = false;
+  std::function<void()> _onComplete;
   CompletionResult _result;
 };
package/src/LlamaContext.cpp
CHANGED
@@ -70,7 +70,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
         .ThrowAsJavaScriptException();
   }

-  _sess = std::make_shared<LlamaSession>(ctx, params);
+  _sess = std::make_shared<LlamaSession>(model, ctx, params);
   _info = get_system_info(params);
 }

@@ -93,6 +93,10 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
     Napi::TypeError::New(env, "Context is disposed")
         .ThrowAsJavaScriptException();
   }
+  if (_wip != nullptr) {
+    Napi::TypeError::New(env, "Another completion is in progress")
+        .ThrowAsJavaScriptException();
+  }
   auto options = info[0].As<Napi::Object>();

   gpt_params params = _sess->params();
@@ -143,6 +147,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
       new LlamaCompletionWorker(info, _sess, callback, params, stop_words);
   worker->Queue();
   _wip = worker;
+  worker->onComplete([this]() { _wip = nullptr; });
   return worker->Promise();
 }

@@ -163,6 +168,12 @@ Napi::Value LlamaContext::SaveSession(const Napi::CallbackInfo &info) {
     Napi::TypeError::New(env, "Context is disposed")
         .ThrowAsJavaScriptException();
   }
+#ifdef GGML_USE_VULKAN
+  if (_sess->params().n_gpu_layers > 0) {
+    Napi::TypeError::New(env, "Vulkan cannot save session")
+        .ThrowAsJavaScriptException();
+  }
+#endif
   auto *worker = new SaveSessionWorker(info, _sess);
   worker->Queue();
   return worker->Promise();
@@ -178,6 +189,12 @@ Napi::Value LlamaContext::LoadSession(const Napi::CallbackInfo &info) {
     Napi::TypeError::New(env, "Context is disposed")
         .ThrowAsJavaScriptException();
   }
+#ifdef GGML_USE_VULKAN
+  if (_sess->params().n_gpu_layers > 0) {
+    Napi::TypeError::New(env, "Vulkan cannot load session")
+        .ThrowAsJavaScriptException();
+  }
+#endif
   auto *worker = new LoadSessionWorker(info, _sess);
   worker->Queue();
   return worker->Promise();
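The `_wip` guard added in `Completion` means each context runs at most one completion at a time. A hypothetical TypeScript sketch of how a caller might serialize requests; the `prompt` field and the single-argument call are assumptions based on the README example, not taken from this diff:

```ts
import type { LlamaContext } from '@fugood/llama.node'

// Run prompts one at a time: starting a second completion() on the same
// context before the previous one settles would now hit the new
// "Another completion is in progress" error.
const runPrompts = async (context: LlamaContext, prompts: string[]) => {
  const results: string[] = []
  for (const prompt of prompts) {
    const { text } = await context.completion({ prompt })
    results.push(text)
  }
  return results
}
```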
package/src/common.hpp
CHANGED
@@ -46,32 +46,36 @@ constexpr T get_option(const Napi::Object &options, const std::string &name,

 class LlamaSession {
 public:
-  LlamaSession(llama_context *ctx, gpt_params params)
-      : ctx_(LlamaCppContext(ctx, llama_free)), params_(params) {
+  LlamaSession(llama_model *model, llama_context *ctx, gpt_params params)
+      : model_(LlamaCppModel(model, llama_free_model)), ctx_(LlamaCppContext(ctx, llama_free)), params_(params) {
     tokens_.reserve(params.n_ctx);
   }

   ~LlamaSession() { dispose(); }

-  llama_context *context() { return ctx_.get(); }
+  inline llama_context *context() { return ctx_.get(); }

-
+  inline llama_model *model() { return model_.get(); }

-
+  inline std::vector<llama_token>* tokens_ptr() { return &tokens_; }
+
+  inline void set_tokens(std::vector<llama_token> tokens) {
     tokens_ = std::move(tokens);
   }

-  const gpt_params &params() const { return params_; }
+  inline const gpt_params &params() const { return params_; }

-  std::mutex &get_mutex() { return mutex; }
+  inline std::mutex &get_mutex() { return mutex; }

   void dispose() {
     std::lock_guard<std::mutex> lock(mutex);
     tokens_.clear();
     ctx_.reset();
+    model_.reset();
   }

 private:
+  LlamaCppModel model_;
   LlamaCppContext ctx_;
   const gpt_params params_;
   std::vector<llama_token> tokens_{};
package/src/llama.cpp/CMakeLists.txt
CHANGED

@@ -43,11 +43,7 @@ else()
     set(LLAMA_METAL_DEFAULT OFF)
 endif()

-
-    set(LLAMA_LLAMAFILE_DEFAULT OFF)
-else()
-    set(LLAMA_LLAMAFILE_DEFAULT ON)
-endif()
+set(LLAMA_LLAMAFILE_DEFAULT ON)

 # general
 option(BUILD_SHARED_LIBS "build shared libraries" OFF)
@@ -107,6 +103,8 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
     "llama: max. batch size for using peer access")
 option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
+option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)
+
 option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
@@ -407,12 +405,16 @@ if (LLAMA_CUDA)
     list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")

     add_compile_definitions(GGML_USE_CUDA)
+    add_compile_definitions(GGML_CUDA_USE_GRAPHS)
     if (LLAMA_CUDA_FORCE_DMMV)
         add_compile_definitions(GGML_CUDA_FORCE_DMMV)
     endif()
     if (LLAMA_CUDA_FORCE_MMQ)
         add_compile_definitions(GGML_CUDA_FORCE_MMQ)
     endif()
+    if (LLAMA_CUDA_NO_VMM)
+        add_compile_definitions(GGML_CUDA_NO_VMM)
+    endif()
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -429,7 +431,7 @@ if (LLAMA_CUDA)

     if (LLAMA_STATIC)
         if (WIN32)
-            # As of 12.3.1 CUDA
+            # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
         else ()
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -438,7 +440,11 @@ if (LLAMA_CUDA)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()

-
+    if (LLAMA_CUDA_NO_VMM)
+        # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
+    else()
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+    endif()

     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
         # 52 == lowest CUDA 12 standard