@fugood/llama.node 1.4.9 → 1.4.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +43 -0
- package/lib/parallel.js +26 -0
- package/lib/parallel.ts +33 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +6 -8
- package/src/LlamaCompletionWorker.cpp +3 -1
- package/src/LlamaCompletionWorker.h +2 -0
- package/src/LlamaContext.cpp +14 -1
- package/src/LlamaContext.h +3 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -4
- package/src/llama.cpp/common/arg.cpp +52 -11
- package/src/llama.cpp/common/arg.h +10 -1
- package/src/llama.cpp/common/common.h +2 -1
- package/src/llama.cpp/common/preset.cpp +197 -5
- package/src/llama.cpp/common/preset.h +45 -3
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +3 -3
package/lib/binding.ts
CHANGED
@@ -120,6 +120,8 @@ export type LlamaCompletionOptions = {
   tool_choice?: string
   enable_thinking?: boolean
   thinking_forced_open?: boolean
+  /** Serialized PEG parser for chat output parsing (required for PEG format types) */
+  chat_parser?: string
   prompt?: string
   /**
    * Text to prefill the response with.
@@ -415,6 +417,8 @@ export type JinjaFormattedChatResult = {
   thinking_forced_open: boolean
   preserved_tokens: string[]
   additional_stops: string[]
+  /** Serialized PEG parser for chat output parsing (required for PEG format types) */
+  chat_parser: string
 }

 export type Tool = {
@@ -435,6 +439,24 @@ export type ToolCall = {
   id?: string
 }

+export type ParallelRequestStatus = {
+  request_id: number
+  type: 'completion' | 'embedding' | 'rerank'
+  state: 'queued' | 'processing_prompt' | 'generating' | 'done'
+  prompt_length: number
+  tokens_generated: number
+  prompt_ms: number
+  generation_ms: number
+  tokens_per_second: number
+}
+
+export type ParallelStatus = {
+  n_parallel: number
+  active_slots: number
+  queued_requests: number
+  requests: ParallelRequestStatus[]
+}
+
 export interface LlamaContext {
   new (
     options: LlamaModelOptions,
@@ -612,6 +634,27 @@ export interface LlamaContext {
    */
   cancelRequest(requestId: number): void

+  /**
+   * Get current parallel processing status (one-time snapshot)
+   * @returns Current parallel status
+   */
+  getParallelStatus(): ParallelStatus
+
+  /**
+   * Subscribe to parallel processing status changes
+   * @param callback Called whenever parallel status changes
+   * @returns Subscriber ID that can be used to unsubscribe
+   */
+  subscribeParallelStatus(
+    callback: (status: ParallelStatus) => void,
+  ): { subscriberId: number }
+
+  /**
+   * Unsubscribe from parallel processing status changes
+   * @param subscriberId Subscriber ID returned from subscribeParallelStatus
+   */
+  unsubscribeParallelStatus(subscriberId: number): void
+
   /**
    * Clear the KV and recurrent caches.
    * This is faster than recreating the context and useful for preventing
package/lib/parallel.js
CHANGED
@@ -212,5 +212,31 @@ class LlamaParallelAPI {
   isEnabled() {
     return this.enabled;
   }
+  /**
+   * Get current parallel processing status (one-time snapshot)
+   * @returns Current parallel status
+   */
+  getStatus() {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.');
+    }
+    return this.context.getParallelStatus();
+  }
+  /**
+   * Subscribe to parallel processing status changes
+   * @param callback Called whenever parallel status changes
+   * @returns Object with remove() method to unsubscribe
+   */
+  subscribeToStatus(callback) {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.');
+    }
+    const { subscriberId } = this.context.subscribeParallelStatus(callback);
+    return {
+      remove: () => {
+        this.context.unsubscribeParallelStatus(subscriberId);
+      },
+    };
+  }
 }
 exports.LlamaParallelAPI = LlamaParallelAPI;
package/lib/parallel.ts
CHANGED
@@ -4,6 +4,7 @@ import type {
   LlamaCompletionOptions,
   LlamaCompletionToken,
   RerankParams,
+  ParallelStatus,
 } from './binding'
 import { formatMediaChat } from './utils'

@@ -278,4 +279,36 @@ export class LlamaParallelAPI {
   isEnabled(): boolean {
     return this.enabled
   }
+
+  /**
+   * Get current parallel processing status (one-time snapshot)
+   * @returns Current parallel status
+   */
+  getStatus(): ParallelStatus {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.')
+    }
+    return this.context.getParallelStatus()
+  }
+
+  /**
+   * Subscribe to parallel processing status changes
+   * @param callback Called whenever parallel status changes
+   * @returns Object with remove() method to unsubscribe
+   */
+  subscribeToStatus(
+    callback: (status: ParallelStatus) => void,
+  ): { remove: () => void } {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.')
+    }
+
+    const { subscriberId } = this.context.subscribeParallelStatus(callback)
+
+    return {
+      remove: () => {
+        this.context.unsubscribeParallelStatus(subscriberId)
+      },
+    }
+  }
 }
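The wrapper methods above mirror the low-level binding but guard on `enable()` having been called and return a handle instead of a raw subscriber id. A usage sketch, assuming `parallel` is an enabled `LlamaParallelAPI` instance (names illustrative):

    // throws if parallel mode was never enabled
    const snapshot = parallel.getStatus()
    console.log(`queued: ${snapshot.queued_requests}`)

    // the returned handle hides the subscriber id
    const sub = parallel.subscribeToStatus((status) => {
      if (status.active_slots === 0 && status.queued_requests === 0) {
        console.log('all slots idle')
      }
    })
    sub.remove() // detach when done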
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.9",
+  "version": "1.4.10",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.9",
-    "@fugood/node-llama-darwin-x64": "1.4.9",
-    "@fugood/node-llama-linux-arm64": "1.4.9",
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.9",
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.9",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.9",
-    "@fugood/node-llama-linux-x64": "1.4.9",
-    "@fugood/node-llama-linux-x64-cuda": "1.4.9",
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.9",
-    "@fugood/node-llama-win32-arm64": "1.4.9",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.9",
-    "@fugood/node-llama-win32-x64": "1.4.9",
-    "@fugood/node-llama-win32-x64-cuda": "1.4.9",
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.9"
+    "@fugood/node-llama-darwin-arm64": "1.4.10",
+    "@fugood/node-llama-darwin-x64": "1.4.10",
+    "@fugood/node-llama-linux-arm64": "1.4.10",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.10",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.10",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.10",
+    "@fugood/node-llama-linux-x64": "1.4.10",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.10",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.10",
+    "@fugood/node-llama-win32-arm64": "1.4.10",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.10",
+    "@fugood/node-llama-win32-x64": "1.4.10",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.10",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.10"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch
CHANGED
@@ -1,25 +1,23 @@
 diff --git a/src/llama.cpp/common/CMakeLists.txt b/src/llama.cpp/common/CMakeLists.txt
-index
+index f7b99159e..fa37fed19 100644
 --- a/src/llama.cpp/common/CMakeLists.txt
 +++ b/src/llama.cpp/common/CMakeLists.txt
-@@ -
+@@ -154,8 +154,14 @@ if (LLAMA_LLGUIDANCE)
      set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
  endif ()
 
+-target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
 +# Add Windows socket libraries unconditionally on Windows
 +if (WIN32)
 +    set(LLAMA_COMMON_WIN_LIBS ws2_32)
 +else()
 +    set(LLAMA_COMMON_WIN_LIBS "")
 +endif()
-+
- target_include_directories(${TARGET} PUBLIC . ../vendor)
- target_compile_features (${TARGET} PUBLIC cxx_std_17)
--target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
-+target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${LLAMA_COMMON_WIN_LIBS} PUBLIC llama Threads::Threads)
 
++target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${LLAMA_COMMON_WIN_LIBS} PUBLIC llama Threads::Threads)
 
 #
+# copy the license files
 diff --git a/src/llama.cpp/common/chat-peg-parser.cpp b/src/llama.cpp/common/chat-peg-parser.cpp
 index 1bcba9cd8..b7cd68734 100644
 --- a/src/llama.cpp/common/chat-peg-parser.cpp
@@ -110,7 +108,7 @@ index d4e8c7405..af3dec813 100644
      mparams.split_mode = params.split_mode;
      mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index 334372073..e912b593a 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
 @@ -307,6 +307,7 @@ struct lr_opt {
package/src/LlamaCompletionWorker.cpp
CHANGED
@@ -37,6 +37,7 @@ LlamaCompletionWorker::LlamaCompletionWorker(
     int32_t chat_format,
     bool thinking_forced_open,
     std::string reasoning_format,
+    const std::string &chat_parser,
     const std::vector<std::string> &media_paths,
     const std::vector<llama_token> &guide_tokens,
     bool has_vocoder,
@@ -46,6 +47,7 @@ LlamaCompletionWorker::LlamaCompletionWorker(
       _params(params), _stop_words(stop_words), _chat_format(chat_format),
       _thinking_forced_open(thinking_forced_open),
       _reasoning_format(reasoning_format),
+      _chat_parser(chat_parser),
       _media_paths(media_paths), _guide_tokens(guide_tokens),
       _prefill_text(prefill_text),
      _has_vocoder(has_vocoder), _tts_type(tts_type_val) {
@@ -121,7 +123,7 @@ void LlamaCompletionWorker::Execute() {
   }

   // Begin completion with chat format and reasoning settings
-  completion->beginCompletion(_chat_format, common_reasoning_format_from_name(_reasoning_format), _thinking_forced_open);
+  completion->beginCompletion(_chat_format, common_reasoning_format_from_name(_reasoning_format), _thinking_forced_open, _chat_parser);

   // Main completion loop
   int token_count = 0;
package/src/LlamaCompletionWorker.h
CHANGED
@@ -23,6 +23,7 @@ public:
       int32_t chat_format,
       bool thinking_forced_open,
       std::string reasoning_format,
+      const std::string &chat_parser = "",
       const std::vector<std::string> &media_paths = {},
       const std::vector<llama_token> &guide_tokens = {},
       bool has_vocoder = false,
@@ -50,6 +51,7 @@ private:
   int32_t _chat_format;
   bool _thinking_forced_open;
   std::string _reasoning_format;
+  std::string _chat_parser;
   std::vector<std::string> _media_paths;
   std::vector<llama_token> _guide_tokens;
   std::string _prefill_text;
package/src/LlamaContext.cpp
CHANGED
@@ -201,6 +201,15 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
       InstanceMethod<&LlamaContext::CancelRequest>(
           "cancelRequest",
           static_cast<napi_property_attributes>(napi_enumerable)),
+      InstanceMethod<&LlamaContext::GetParallelStatus>(
+          "getParallelStatus",
+          static_cast<napi_property_attributes>(napi_enumerable)),
+      InstanceMethod<&LlamaContext::SubscribeParallelStatus>(
+          "subscribeParallelStatus",
+          static_cast<napi_property_attributes>(napi_enumerable)),
+      InstanceMethod<&LlamaContext::UnsubscribeParallelStatus>(
+          "unsubscribeParallelStatus",
+          static_cast<napi_property_attributes>(napi_enumerable)),
       InstanceMethod<&LlamaContext::ClearCache>(
           "clearCache",
           static_cast<napi_property_attributes>(napi_enumerable)),
@@ -762,6 +771,8 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
           i, Napi::String::New(env, chatParams.additional_stops[i].c_str()));
     }
     result.Set("additional_stops", additional_stops);
+    // chat_parser: string (serialized PEG parser for chat output parsing)
+    result.Set("chat_parser", chatParams.parser);

     return result;
   } else {
@@ -823,6 +834,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   int32_t chat_format = get_option<int32_t>(options, "chat_format", 0);
   bool thinking_forced_open = get_option<bool>(options, "thinking_forced_open", false);
   std::string reasoning_format = get_option<std::string>(options, "reasoning_format", "none");
+  std::string chat_parser = get_option<std::string>(options, "chat_parser", "");

   common_params params = _rn_ctx->params;
   auto grammar_from_params = get_option<std::string>(options, "grammar", "");
@@ -961,6 +973,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {

     chat_format = chatParams.format;
     thinking_forced_open = chatParams.thinking_forced_open;
+    chat_parser = chatParams.parser;

     for (const auto &token : chatParams.preserved_tokens) {
       auto ids =
@@ -1076,7 +1089,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {

   auto *worker =
       new LlamaCompletionWorker(info, _rn_ctx, callback, params, stop_words,
-                                chat_format, thinking_forced_open, reasoning_format, media_paths, guide_tokens,
+                                chat_format, thinking_forced_open, reasoning_format, chat_parser, media_paths, guide_tokens,
                                 _rn_ctx->has_vocoder, _rn_ctx->tts_wrapper ? _rn_ctx->tts_wrapper->type : rnllama::UNKNOWN, prefill_text);
   worker->Queue();
   _wip = worker;
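The net effect of these changes is that the serialized PEG parser travels with the rest of the formatted-chat results: `getFormattedChat` returns `chat_parser`, `completion` accepts it as an option, and the worker forwards it into `beginCompletion`. A sketch of the intended round trip from JavaScript (the exact call shapes are assumptions for illustration; only the `chat_parser` field itself comes from this diff):

    // format first; the result now carries the serialized parser
    const formatted = context.getFormattedChat(/* messages, template, options */)

    // hand the parser back so the native side can parse tool calls / reasoning
    context.completion(
      {
        prompt: formatted.prompt,
        chat_format: formatted.chat_format,
        thinking_forced_open: formatted.thinking_forced_open,
        chat_parser: formatted.chat_parser,
      },
      (tok) => process.stdout.write(tok.token), // token callback; field name assumed
    )

When `completion` is instead given raw messages, the native code fills `chat_parser` itself from `chatParams.parser`, so passing it explicitly matters mainly for this prompt-based path.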
package/src/LlamaContext.h
CHANGED
@@ -68,6 +68,9 @@ private:
   Napi::Value QueueEmbedding(const Napi::CallbackInfo &info);
   Napi::Value QueueRerank(const Napi::CallbackInfo &info);
   void CancelRequest(const Napi::CallbackInfo &info);
+  Napi::Value GetParallelStatus(const Napi::CallbackInfo &info);
+  Napi::Value SubscribeParallelStatus(const Napi::CallbackInfo &info);
+  void UnsubscribeParallelStatus(const Napi::CallbackInfo &info);

   // Cache management
   void ClearCache(const Napi::CallbackInfo &info);
package/src/llama.cpp/common/CMakeLists.txt
CHANGED
@@ -85,6 +85,9 @@ add_library(${TARGET} STATIC
     unicode.h
     )

+target_include_directories(${TARGET} PUBLIC . ../vendor)
+target_compile_features (${TARGET} PUBLIC cxx_std_17)
+
 if (BUILD_SHARED_LIBS)
     set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
@@ -158,10 +161,7 @@ else()
     set(LLAMA_COMMON_WIN_LIBS "")
 endif()

-
-target_compile_features (${TARGET} PUBLIC cxx_std_17)
-target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${LLAMA_COMMON_WIN_LIBS} PUBLIC llama Threads::Threads)
-
+target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${LLAMA_COMMON_WIN_LIBS} PUBLIC llama Threads::Threads)

 #
 # copy the license files
package/src/llama.cpp/common/arg.cpp
CHANGED
@@ -96,6 +96,11 @@ common_arg & common_arg::set_sparam() {
     return *this;
 }

+common_arg & common_arg::set_preset_only() {
+    is_preset_only = true;
+    return *this;
+}
+
 bool common_arg::in_example(enum llama_example ex) {
     return examples.find(ex) != examples.end();
 }
@@ -772,6 +777,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
         }
         auto opt = *arg_to_options[arg];
         std::string val;
+        if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
+            // bool arg (need to reverse the meaning for negative args)
+            bool is_neg = std::find(opt.args_neg.begin(), opt.args_neg.end(), arg) != opt.args_neg.end();
+            val = is_neg ? "0" : "1";
+        }
         if (opt.value_hint != nullptr) {
             // arg with single value
             check_arg(i);
@@ -1139,7 +1149,7 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
         }
     ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"
+        {"-cram", "--cache-ram"}, "N",
         string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
             "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
         [](common_params & params, int value) {
@@ -1147,7 +1157,7 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
         }
     ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"
+        {"-kvu", "--kv-unified"},
         "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
         [](common_params & params) {
             params.kv_unified = true;
@@ -1415,7 +1425,7 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
         }
     ).set_sparam());
     add_opt(common_arg(
-        {"--
+        {"--sampler-seq", "--sampling-seq"}, "SEQUENCE",
         string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
         [](common_params & params, const std::string & value) {
             params.sampling.samplers = common_sampler_types_from_chars(value);
@@ -2073,26 +2083,26 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
         }
     ));
     add_opt(common_arg(
-        {"
+        {"-ot", "--override-tensor"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type", [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
         }
     ));
     add_opt(common_arg(
-        {"--override-tensor-draft"
+        {"-otd", "--override-tensor-draft"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"
+        {"-cmoe", "--cpu-moe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
             params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
-        {"--n-cpu-moe"
+        {"-ncmoe", "--n-cpu-moe"}, "N",
         "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
         [](common_params & params, int value) {
             if (value < 0) {
@@ -2107,14 +2117,14 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
         }
     ).set_env("LLAMA_ARG_N_CPU_MOE"));
     add_opt(common_arg(
-        {"--cpu-moe-draft"
+        {"-cmoed", "--cpu-moe-draft"},
         "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
         [](common_params & params) {
             params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
     add_opt(common_arg(
-        {"--n-cpu-moe-draft"
+        {"-ncmoed", "--n-cpu-moe-draft"}, "N",
         "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
         [](common_params & params, int value) {
             if (value < 0) {
@@ -2642,7 +2652,7 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
-        {"--
+        {"--rerank", "--reranking"},
         string_format("enable reranking endpoint on server (default: %s)", "disabled"),
         [](common_params & params) {
             params.embedding = true;
@@ -2877,6 +2887,16 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
             params.lora_init_without_apply = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--sleep-idle-seconds"}, "SECONDS",
+        string_format("number of seconds of idleness after which the server will sleep (default: %d; -1 = disabled)", params.sleep_idle_seconds),
+        [](common_params & params, int value) {
+            if (value == 0 || value < -1) {
+                throw std::invalid_argument("invalid value: cannot be 0 or less than -1");
+            }
+            params.sleep_idle_seconds = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--simple-io"},
         "use basic IO for better compatibility in subprocesses and limited consoles",
@@ -3113,7 +3133,7 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
-        {"--draft
+        {"--draft", "--draft-n", "--draft-max"}, "N",
         string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
         [](common_params & params, int value) {
             params.speculative.n_max = value;
@@ -3489,3 +3509,24 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {

     return ctx_arg;
 }
+
+void common_params_add_preset_options(std::vector<common_arg> & args) {
+    // arguments below won't be treated as CLI args, only preset options
+    args.push_back(common_arg(
+        {"load-on-startup"}, "NAME",
+        "in server router mode, autoload this model on startup",
+        [](common_params &, const std::string &) { /* unused */ }
+    ).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());
+
+    // args.push_back(common_arg(
+    //     {"pin"},
+    //     "in server router mode, do not unload this model if models_max is exceeded",
+    //     [](common_params &) { /* unused */ }
+    // ).set_preset_only());
+
+    // args.push_back(common_arg(
+    //     {"unload-idle-seconds"}, "SECONDS",
+    //     "in server router mode, unload models idle for more than this many seconds",
+    //     [](common_params &, int) { /* unused */ }
+    // ).set_preset_only());
+}
package/src/llama.cpp/common/arg.h
CHANGED
@@ -8,6 +8,9 @@
 #include <vector>
 #include <cstring>

+// pseudo-env variable to identify preset-only arguments
+#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
+
 //
 // CLI argument parsing
 //
@@ -22,6 +25,7 @@ struct common_arg {
     const char * env = nullptr;
     std::string help;
     bool is_sparam = false; // is current arg a sampling param?
+    bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
     void (*handler_void)   (common_params & params) = nullptr;
     void (*handler_string) (common_params & params, const std::string &) = nullptr;
     void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
@@ -70,6 +74,7 @@ struct common_arg {
     common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
     common_arg & set_env(const char * env);
     common_arg & set_sparam();
+    common_arg & set_preset_only();
     bool in_example(enum llama_example ex);
     bool is_exclude(enum llama_example ex);
     bool get_value_from_env(std::string & output) const;
@@ -114,9 +119,13 @@ struct common_params_context {
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

 // parse input arguments from CLI into a map
-// TODO: support repeated args in the future
 bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);

+// populate preset-only arguments
+// these arguments are not treated as command line arguments
+// see: https://github.com/ggml-org/llama.cpp/issues/18163
+void common_params_add_preset_options(std::vector<common_arg> & args);
+
 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
package/src/llama.cpp/common/common.h
CHANGED
@@ -476,7 +476,8 @@
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
-    bool prefill_assistant = true;
+    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
+    int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time

     std::vector<std::string> api_keys;

package/src/llama.cpp/common/preset.cpp
CHANGED
@@ -2,6 +2,7 @@
 #include "preset.h"
 #include "peg-parser.h"
 #include "log.h"
+#include "download.h"

 #include <fstream>
 #include <sstream>
@@ -15,11 +16,22 @@ static std::string rm_leading_dashes(const std::string & str) {
     return str.substr(pos);
 }

-std::vector<std::string> common_preset::to_args() const {
+std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
     std::vector<std::string> args;

+    if (!bin_path.empty()) {
+        args.push_back(bin_path);
+    }
+
     for (const auto & [opt, value] : options) {
-
+        if (opt.is_preset_only) {
+            continue; // skip preset-only options (they are not CLI args)
+        }
+
+        // use the last arg as the main arg (i.e. --long-form)
+        args.push_back(opt.args.back());
+
+        // handle value(s)
         if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
             // flag option, no value
             if (common_arg_utils::is_falsey(value)) {
@@ -63,6 +75,52 @@ std::string common_preset::to_ini() const {
     return ss.str();
 }

+void common_preset::set_option(const common_preset_context & ctx, const std::string & env, const std::string & value) {
+    // try if option exists, update it
+    for (auto & [opt, val] : options) {
+        if (opt.env && env == opt.env) {
+            val = value;
+            return;
+        }
+    }
+    // if option does not exist, we need to add it
+    if (ctx.key_to_opt.find(env) == ctx.key_to_opt.end()) {
+        throw std::runtime_error(string_format(
+            "%s: option with env '%s' not found in ctx_params",
+            __func__, env.c_str()
+        ));
+    }
+    options[ctx.key_to_opt.at(env)] = value;
+}
+
+void common_preset::unset_option(const std::string & env) {
+    for (auto it = options.begin(); it != options.end(); ) {
+        const common_arg & opt = it->first;
+        if (opt.env && env == opt.env) {
+            it = options.erase(it);
+            return;
+        } else {
+            ++it;
+        }
+    }
+}
+
+bool common_preset::get_option(const std::string & env, std::string & value) const {
+    for (const auto & [opt, val] : options) {
+        if (opt.env && env == opt.env) {
+            value = val;
+            return true;
+        }
+    }
+    return false;
+}
+
+void common_preset::merge(const common_preset & other) {
+    for (const auto & [opt, val] : other.options) {
+        options[opt] = val; // overwrite existing options
+    }
+}
+
 static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
     std::map<std::string, std::map<std::string, std::string>> parsed;

@@ -172,9 +230,14 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & key, const std::string & value) {
     return value;
 }

-
+common_preset_context::common_preset_context(llama_example ex)
+        : ctx_params(common_params_parser_init(default_params, ex)) {
+    common_params_add_preset_options(ctx_params.options);
+    key_to_opt = get_map_key_opt(ctx_params);
+}
+
+common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
     common_presets out;
-    auto key_to_opt = get_map_key_opt(ctx_params);
     auto ini_data = parse_ini_from_file(path);

     for (auto section : ini_data) {
@@ -188,7 +251,7 @@ common_presets common_presets_load(const std::string & path, common_params_conte
         for (const auto & [key, value] : section.second) {
             LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
             if (key_to_opt.find(key) != key_to_opt.end()) {
-                auto & opt = key_to_opt
+                const auto & opt = key_to_opt.at(key);
                 if (is_bool_arg(opt)) {
                     preset.options[opt] = parse_bool_arg(opt, key, value);
                 } else {
@@ -199,8 +262,137 @@ common_presets common_presets_load(const std::string & path, common_params_conte
                 // TODO: maybe warn about unknown key?
             }
         }
+
+        if (preset.name == "*") {
+            // handle global preset
+            global = preset;
+        } else {
+            out[preset.name] = preset;
+        }
+    }
+
+    return out;
+}
+
+common_presets common_preset_context::load_from_cache() const {
+    common_presets out;
+
+    auto cached_models = common_list_cached_models();
+    for (const auto & model : cached_models) {
+        common_preset preset;
+        preset.name = model.to_string();
+        preset.set_option(*this, "LLAMA_ARG_HF_REPO", model.to_string());
         out[preset.name] = preset;
     }

     return out;
 }
+
+struct local_model {
+    std::string name;
+    std::string path;
+    std::string path_mmproj;
+};
+
+common_presets common_preset_context::load_from_models_dir(const std::string & models_dir) const {
+    if (!std::filesystem::exists(models_dir) || !std::filesystem::is_directory(models_dir)) {
+        throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", models_dir.c_str()));
+    }
+
+    std::vector<local_model> models;
+    auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) {
+        auto files = fs_list(subdir_path, false);
+        common_file_info model_file;
+        common_file_info first_shard_file;
+        common_file_info mmproj_file;
+        for (const auto & file : files) {
+            if (string_ends_with(file.name, ".gguf")) {
+                if (file.name.find("mmproj") != std::string::npos) {
+                    mmproj_file = file;
+                } else if (file.name.find("-00001-of-") != std::string::npos) {
+                    first_shard_file = file;
+                } else {
+                    model_file = file;
+                }
+            }
+        }
+        // single file model
+        local_model model{
+            /* name        */ name,
+            /* path        */ first_shard_file.path.empty() ? model_file.path : first_shard_file.path,
+            /* path_mmproj */ mmproj_file.path // can be empty
+        };
+        if (!model.path.empty()) {
+            models.push_back(model);
+        }
+    };
+
+    auto files = fs_list(models_dir, true);
+    for (const auto & file : files) {
+        if (file.is_dir) {
+            scan_subdir(file.path, file.name);
+        } else if (string_ends_with(file.name, ".gguf")) {
+            // single file model
+            std::string name = file.name;
+            string_replace_all(name, ".gguf", "");
+            local_model model{
+                /* name        */ name,
+                /* path        */ file.path,
+                /* path_mmproj */ ""
+            };
+            models.push_back(model);
+        }
+    }
+
+    // convert local models to presets
+    common_presets out;
+    for (const auto & model : models) {
+        common_preset preset;
+        preset.name = model.name;
+        preset.set_option(*this, "LLAMA_ARG_MODEL", model.path);
+        if (!model.path_mmproj.empty()) {
+            preset.set_option(*this, "LLAMA_ARG_MMPROJ", model.path_mmproj);
+        }
+        out[preset.name] = preset;
+    }
+
+    return out;
+}
+
+common_preset common_preset_context::load_from_args(int argc, char ** argv) const {
+    common_preset preset;
+    preset.name = COMMON_PRESET_DEFAULT_NAME;
+
+    bool ok = common_params_to_map(argc, argv, ctx_params.ex, preset.options);
+    if (!ok) {
+        throw std::runtime_error("failed to parse CLI arguments into preset");
+    }
+
+    return preset;
+}
+
+common_presets common_preset_context::cascade(const common_presets & base, const common_presets & added) const {
+    common_presets out = base; // copy
+    for (const auto & [name, preset_added] : added) {
+        if (out.find(name) != out.end()) {
+            // if exists, merge
+            common_preset & target = out[name];
+            target.merge(preset_added);
+        } else {
+            // otherwise, add directly
+            out[name] = preset_added;
+        }
+    }
+    return out;
+}
+
+common_presets common_preset_context::cascade(const common_preset & base, const common_presets & presets) const {
+    common_presets out;
+    for (const auto & [name, preset] : presets) {
+        common_preset tmp = base; // copy
+        tmp.name = name;
+        tmp.merge(preset);
+        out[name] = std::move(tmp);
+    }
+    return out;
+}
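Since `load_from_ini` treats each INI section as one preset and reserves the section name `*` for a global preset, a hypothetical presets file might look like the following (the key spelling and values are illustrative guesses based on `rm_leading_dashes` and the option table, not documented syntax):

    [*]
    ctx-size = 4096

    [my-local-model]
    model = /models/my-model.gguf

The `cascade(base, presets)` overload then applies each named preset on top of the global one, with `merge` letting the named preset's values overwrite the shared defaults, in the same spirit as CSS cascading.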
package/src/llama.cpp/common/preset.h
CHANGED
@@ -13,20 +13,62 @@

 constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default";

+struct common_preset_context;
+
 struct common_preset {
     std::string name;
-
+
+    // options are stored as common_arg to string mapping, representing CLI arg and its value
     std::map<common_arg, std::string> options;

     // convert preset to CLI argument list
-    std::vector<std::string> to_args() const;
+    std::vector<std::string> to_args(const std::string & bin_path = "") const;

     // convert preset to INI format string
     std::string to_ini() const;

     // TODO: maybe implement to_env() if needed
+
+    // modify preset options where argument is identified by its env variable
+    void set_option(const common_preset_context & ctx, const std::string & env, const std::string & value);
+
+    // unset option by its env variable
+    void unset_option(const std::string & env);
+
+    // get option value by its env variable, return false if not found
+    bool get_option(const std::string & env, std::string & value) const;
+
+    // merge another preset into this one, overwriting existing options
+    void merge(const common_preset & other);
 };

 // interface for multiple presets in one file
 using common_presets = std::map<std::string, common_preset>;
-
+
+// context for loading and editing presets
+struct common_preset_context {
+    common_params default_params; // unused for now
+    common_params_context ctx_params;
+    std::map<std::string, common_arg> key_to_opt;
+    common_preset_context(llama_example ex);
+
+    // load presets from INI file
+    common_presets load_from_ini(const std::string & path, common_preset & global) const;
+
+    // generate presets from cached models
+    common_presets load_from_cache() const;
+
+    // generate presets from local models directory
+    // for the directory structure, see "Using multiple models" in server/README.md
+    common_presets load_from_models_dir(const std::string & models_dir) const;
+
+    // generate one preset from CLI arguments
+    common_preset load_from_args(int argc, char ** argv) const;
+
+    // cascade multiple presets if exist on both: base < added
+    // if preset does not exist in base, it will be added without modification
+    common_presets cascade(const common_presets & base, const common_presets & added) const;
+
+    // apply presets over a base preset (same idea as CSS cascading)
+    common_presets cascade(const common_preset & base, const common_presets & presets) const;
+};
package/src/llama.cpp/ggml/CMakeLists.txt
CHANGED
@@ -254,6 +254,7 @@ set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
                                             "gmml: OpenCL API version to target")

 option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
+set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml: quantize group size (32, 64, or 128)")

 # toolchain for vulkan-shaders-gen
 set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
package/src/llama.cpp/src/llama-model-loader.cpp
CHANGED
@@ -1086,10 +1086,10 @@ bool llama_model_loader::load_all_data(
         } else {
             // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
             if (upload_backend) {
-
+                size_t offset = weight->offs;
                 alignment = file->read_alignment();
-
-
+                size_t aligned_offset = offset & ~(alignment - 1);
+                size_t offset_from_alignment = offset - aligned_offset;
                 file->seek(aligned_offset, SEEK_SET);

                 // Calculate aligned read boundaries
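The two new lines round the weight's file offset down to the nearest alignment boundary with the usual power-of-two mask trick, then remember how far into the aligned block the real data starts. A quick illustration of the same arithmetic (standalone sketch, not code from the package; the mask trick assumes `alignment` is a power of two):

    // e.g. offset = 1000, alignment = 256:
    //   aligned_offset        = 1000 & ~255 = 768
    //   offset_from_alignment = 1000 - 768  = 232
    const alignDown = (offset: number, alignment: number): number =>
      offset & ~(alignment - 1) // 32-bit bitwise ops; fine for illustration, not for >2 GiB offsets

    console.log(alignDown(1000, 256)) // 768

The loader then seeks to the aligned position and skips the first `offset_from_alignment` bytes of the chunk, keeping every file read aligned for the pinned-memory upload path described in the surrounding comments.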
|