@fugood/llama.node 1.4.9 → 1.4.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/binding.ts CHANGED
@@ -120,6 +120,8 @@ export type LlamaCompletionOptions = {
   tool_choice?: string
   enable_thinking?: boolean
   thinking_forced_open?: boolean
+  /** Serialized PEG parser for chat output parsing (required for PEG format types) */
+  chat_parser?: string
   prompt?: string
   /**
    * Text to prefill the response with.
@@ -415,6 +417,8 @@ export type JinjaFormattedChatResult = {
   thinking_forced_open: boolean
   preserved_tokens: string[]
   additional_stops: string[]
+  /** Serialized PEG parser for chat output parsing (required for PEG format types) */
+  chat_parser: string
 }
 
 export type Tool = {
@@ -435,6 +439,24 @@ export type ToolCall = {
   id?: string
 }
 
+export type ParallelRequestStatus = {
+  request_id: number
+  type: 'completion' | 'embedding' | 'rerank'
+  state: 'queued' | 'processing_prompt' | 'generating' | 'done'
+  prompt_length: number
+  tokens_generated: number
+  prompt_ms: number
+  generation_ms: number
+  tokens_per_second: number
+}
+
+export type ParallelStatus = {
+  n_parallel: number
+  active_slots: number
+  queued_requests: number
+  requests: ParallelRequestStatus[]
+}
+
 export interface LlamaContext {
   new (
     options: LlamaModelOptions,
@@ -612,6 +634,27 @@ export interface LlamaContext {
    */
   cancelRequest(requestId: number): void
 
+  /**
+   * Get current parallel processing status (one-time snapshot)
+   * @returns Current parallel status
+   */
+  getParallelStatus(): ParallelStatus
+
+  /**
+   * Subscribe to parallel processing status changes
+   * @param callback Called whenever parallel status changes
+   * @returns Subscriber ID that can be used to unsubscribe
+   */
+  subscribeParallelStatus(
+    callback: (status: ParallelStatus) => void,
+  ): { subscriberId: number }
+
+  /**
+   * Unsubscribe from parallel processing status changes
+   * @param subscriberId Subscriber ID returned from subscribeParallelStatus
+   */
+  unsubscribeParallelStatus(subscriberId: number): void
+
   /**
    * Clear the KV and recurrent caches.
    * This is faster than recreating the context and useful for preventing
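For orientation, here is a minimal TypeScript sketch of how the new parallel-status methods on the native `LlamaContext` binding could be consumed. Only the `getParallelStatus`, `subscribeParallelStatus`, and `unsubscribeParallelStatus` signatures and the `ParallelStatus` fields come from the binding.ts changes above; the import path and the surrounding helper are assumptions.

```ts
import type { LlamaContext, ParallelStatus } from './binding' // assumed import path

// Hypothetical helper: everything except the three method signatures is illustrative.
function watchParallelStatus(ctx: LlamaContext): () => void {
  // One-time snapshot
  const snapshot: ParallelStatus = ctx.getParallelStatus()
  console.log(`slots ${snapshot.active_slots}/${snapshot.n_parallel}, queued ${snapshot.queued_requests}`)

  // Push-based updates whenever the status changes
  const { subscriberId } = ctx.subscribeParallelStatus((status) => {
    for (const req of status.requests) {
      console.log(`#${req.request_id} [${req.type}] ${req.state} ${req.tokens_per_second.toFixed(1)} tok/s`)
    }
  })

  // Disposer that detaches the subscription
  return () => ctx.unsubscribeParallelStatus(subscriberId)
}
```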
package/lib/parallel.js CHANGED
@@ -212,5 +212,31 @@ class LlamaParallelAPI {
   isEnabled() {
     return this.enabled;
   }
+  /**
+   * Get current parallel processing status (one-time snapshot)
+   * @returns Current parallel status
+   */
+  getStatus() {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.');
+    }
+    return this.context.getParallelStatus();
+  }
+  /**
+   * Subscribe to parallel processing status changes
+   * @param callback Called whenever parallel status changes
+   * @returns Object with remove() method to unsubscribe
+   */
+  subscribeToStatus(callback) {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.');
+    }
+    const { subscriberId } = this.context.subscribeParallelStatus(callback);
+    return {
+      remove: () => {
+        this.context.unsubscribeParallelStatus(subscriberId);
+      },
+    };
+  }
 }
 exports.LlamaParallelAPI = LlamaParallelAPI;
package/lib/parallel.ts CHANGED
@@ -4,6 +4,7 @@ import type {
   LlamaCompletionOptions,
   LlamaCompletionToken,
   RerankParams,
+  ParallelStatus,
 } from './binding'
 import { formatMediaChat } from './utils'
 
@@ -278,4 +279,36 @@ export class LlamaParallelAPI {
   isEnabled(): boolean {
     return this.enabled
   }
+
+  /**
+   * Get current parallel processing status (one-time snapshot)
+   * @returns Current parallel status
+   */
+  getStatus(): ParallelStatus {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.')
+    }
+    return this.context.getParallelStatus()
+  }
+
+  /**
+   * Subscribe to parallel processing status changes
+   * @param callback Called whenever parallel status changes
+   * @returns Object with remove() method to unsubscribe
+   */
+  subscribeToStatus(
+    callback: (status: ParallelStatus) => void,
+  ): { remove: () => void } {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.')
+    }
+
+    const { subscriberId } = this.context.subscribeParallelStatus(callback)
+
+    return {
+      remove: () => {
+        this.context.unsubscribeParallelStatus(subscriberId)
+      },
+    }
+  }
 }
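As a hedged usage sketch of the higher-level API added above: `getStatus()` and `subscribeToStatus()` behave as in the code shown, throwing if `enable()` has not been called; how the `LlamaParallelAPI` instance is created and enabled is not shown in this diff and is assumed here.

```ts
import type { LlamaParallelAPI } from './parallel' // assumed import path

// Assumes `parallel` was constructed and enable() was called elsewhere;
// only getStatus(), subscribeToStatus(), and remove() come from the diff above.
function logParallelActivity(parallel: LlamaParallelAPI): void {
  const status = parallel.getStatus() // throws if parallel mode is not enabled
  console.log(`queued requests: ${status.queued_requests}`)

  const subscription = parallel.subscribeToStatus((s) => {
    console.log(`active slots: ${s.active_slots}/${s.n_parallel}`)
  })

  // Detach the listener once it is no longer needed
  setTimeout(() => subscription.remove(), 10_000)
}
```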
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.9",
+  "version": "1.4.10",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.9",
-    "@fugood/node-llama-darwin-x64": "1.4.9",
-    "@fugood/node-llama-linux-arm64": "1.4.9",
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.9",
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.9",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.9",
-    "@fugood/node-llama-linux-x64": "1.4.9",
-    "@fugood/node-llama-linux-x64-cuda": "1.4.9",
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.9",
-    "@fugood/node-llama-win32-arm64": "1.4.9",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.9",
-    "@fugood/node-llama-win32-x64": "1.4.9",
-    "@fugood/node-llama-win32-x64-cuda": "1.4.9",
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.9"
+    "@fugood/node-llama-darwin-arm64": "1.4.10",
+    "@fugood/node-llama-darwin-x64": "1.4.10",
+    "@fugood/node-llama-linux-arm64": "1.4.10",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.10",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.10",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.10",
+    "@fugood/node-llama-linux-x64": "1.4.10",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.10",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.10",
+    "@fugood/node-llama-win32-arm64": "1.4.10",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.10",
+    "@fugood/node-llama-win32-x64": "1.4.10",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.10",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.10"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
@@ -1,25 +1,23 @@
  diff --git a/src/llama.cpp/common/CMakeLists.txt b/src/llama.cpp/common/CMakeLists.txt
- index 0182767c2..f8c4a4f63 100644
+ index f7b99159e..fa37fed19 100644
  --- a/src/llama.cpp/common/CMakeLists.txt
  +++ b/src/llama.cpp/common/CMakeLists.txt
- @@ -151,9 +151,16 @@ if (LLAMA_LLGUIDANCE)
+ @@ -154,8 +154,14 @@ if (LLAMA_LLGUIDANCE)
      set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
  endif ()
 
+ -target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
  +# Add Windows socket libraries unconditionally on Windows
  +if (WIN32)
  +    set(LLAMA_COMMON_WIN_LIBS ws2_32)
  +else()
  +    set(LLAMA_COMMON_WIN_LIBS "")
  +endif()
- +
- target_include_directories(${TARGET} PUBLIC . ../vendor)
- target_compile_features (${TARGET} PUBLIC cxx_std_17)
- -target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
- +target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${LLAMA_COMMON_WIN_LIBS} PUBLIC llama Threads::Threads)
 
+ +target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${LLAMA_COMMON_WIN_LIBS} PUBLIC llama Threads::Threads)
 
  #
+ # copy the license files
  diff --git a/src/llama.cpp/common/chat-peg-parser.cpp b/src/llama.cpp/common/chat-peg-parser.cpp
  index 1bcba9cd8..b7cd68734 100644
  --- a/src/llama.cpp/common/chat-peg-parser.cpp
@@ -110,7 +108,7 @@ index d4e8c7405..af3dec813 100644
      mparams.split_mode = params.split_mode;
      mparams.tensor_split = params.tensor_split;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index 3e314f4c8..5750a4057 100644
+ index 334372073..e912b593a 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
  @@ -307,6 +307,7 @@ struct lr_opt {
@@ -37,6 +37,7 @@ LlamaCompletionWorker::LlamaCompletionWorker(
     int32_t chat_format,
     bool thinking_forced_open,
     std::string reasoning_format,
+    const std::string &chat_parser,
     const std::vector<std::string> &media_paths,
     const std::vector<llama_token> &guide_tokens,
     bool has_vocoder,
@@ -46,6 +47,7 @@ LlamaCompletionWorker::LlamaCompletionWorker(
       _params(params), _stop_words(stop_words), _chat_format(chat_format),
       _thinking_forced_open(thinking_forced_open),
       _reasoning_format(reasoning_format),
+      _chat_parser(chat_parser),
       _media_paths(media_paths), _guide_tokens(guide_tokens),
       _prefill_text(prefill_text),
       _has_vocoder(has_vocoder), _tts_type(tts_type_val) {
@@ -121,7 +123,7 @@ void LlamaCompletionWorker::Execute() {
   }
 
   // Begin completion with chat format and reasoning settings
-  completion->beginCompletion(_chat_format, common_reasoning_format_from_name(_reasoning_format), _thinking_forced_open);
+  completion->beginCompletion(_chat_format, common_reasoning_format_from_name(_reasoning_format), _thinking_forced_open, _chat_parser);
 
   // Main completion loop
   int token_count = 0;
@@ -23,6 +23,7 @@ public:
       int32_t chat_format,
       bool thinking_forced_open,
       std::string reasoning_format,
+      const std::string &chat_parser = "",
       const std::vector<std::string> &media_paths = {},
       const std::vector<llama_token> &guide_tokens = {},
       bool has_vocoder = false,
@@ -50,6 +51,7 @@ private:
   int32_t _chat_format;
   bool _thinking_forced_open;
   std::string _reasoning_format;
+  std::string _chat_parser;
   std::vector<std::string> _media_paths;
   std::vector<llama_token> _guide_tokens;
   std::string _prefill_text;
@@ -201,6 +201,15 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
           InstanceMethod<&LlamaContext::CancelRequest>(
               "cancelRequest",
               static_cast<napi_property_attributes>(napi_enumerable)),
+          InstanceMethod<&LlamaContext::GetParallelStatus>(
+              "getParallelStatus",
+              static_cast<napi_property_attributes>(napi_enumerable)),
+          InstanceMethod<&LlamaContext::SubscribeParallelStatus>(
+              "subscribeParallelStatus",
+              static_cast<napi_property_attributes>(napi_enumerable)),
+          InstanceMethod<&LlamaContext::UnsubscribeParallelStatus>(
+              "unsubscribeParallelStatus",
+              static_cast<napi_property_attributes>(napi_enumerable)),
           InstanceMethod<&LlamaContext::ClearCache>(
               "clearCache",
               static_cast<napi_property_attributes>(napi_enumerable)),
@@ -762,6 +771,8 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
           i, Napi::String::New(env, chatParams.additional_stops[i].c_str()));
     }
     result.Set("additional_stops", additional_stops);
+    // chat_parser: string (serialized PEG parser for chat output parsing)
+    result.Set("chat_parser", chatParams.parser);
 
     return result;
   } else {
@@ -823,6 +834,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   int32_t chat_format = get_option<int32_t>(options, "chat_format", 0);
   bool thinking_forced_open = get_option<bool>(options, "thinking_forced_open", false);
   std::string reasoning_format = get_option<std::string>(options, "reasoning_format", "none");
+  std::string chat_parser = get_option<std::string>(options, "chat_parser", "");
 
   common_params params = _rn_ctx->params;
   auto grammar_from_params = get_option<std::string>(options, "grammar", "");
@@ -961,6 +973,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
 
       chat_format = chatParams.format;
       thinking_forced_open = chatParams.thinking_forced_open;
+      chat_parser = chatParams.parser;
 
       for (const auto &token : chatParams.preserved_tokens) {
         auto ids =
@@ -1076,7 +1089,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
 
   auto *worker =
       new LlamaCompletionWorker(info, _rn_ctx, callback, params, stop_words,
-                                chat_format, thinking_forced_open, reasoning_format, media_paths, guide_tokens,
+                                chat_format, thinking_forced_open, reasoning_format, chat_parser, media_paths, guide_tokens,
                                 _rn_ctx->has_vocoder, _rn_ctx->tts_wrapper ? _rn_ctx->tts_wrapper->type : rnllama::UNKNOWN, prefill_text);
   worker->Queue();
   _wip = worker;
@@ -68,6 +68,9 @@ private:
   Napi::Value QueueEmbedding(const Napi::CallbackInfo &info);
   Napi::Value QueueRerank(const Napi::CallbackInfo &info);
   void CancelRequest(const Napi::CallbackInfo &info);
+  Napi::Value GetParallelStatus(const Napi::CallbackInfo &info);
+  Napi::Value SubscribeParallelStatus(const Napi::CallbackInfo &info);
+  void UnsubscribeParallelStatus(const Napi::CallbackInfo &info);
 
   // Cache management
   void ClearCache(const Napi::CallbackInfo &info);
@@ -85,6 +85,9 @@ add_library(${TARGET} STATIC
     unicode.h
     )
 
+target_include_directories(${TARGET} PUBLIC . ../vendor)
+target_compile_features (${TARGET} PUBLIC cxx_std_17)
+
 if (BUILD_SHARED_LIBS)
     set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
@@ -158,10 +161,7 @@ else()
     set(LLAMA_COMMON_WIN_LIBS "")
 endif()
 
-target_include_directories(${TARGET} PUBLIC . ../vendor)
-target_compile_features (${TARGET} PUBLIC cxx_std_17)
-target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${LLAMA_COMMON_WIN_LIBS} PUBLIC llama Threads::Threads)
-
+target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${LLAMA_COMMON_WIN_LIBS} PUBLIC llama Threads::Threads)
 
 #
 # copy the license files
@@ -96,6 +96,11 @@ common_arg & common_arg::set_sparam() {
     return *this;
 }
 
+common_arg & common_arg::set_preset_only() {
+    is_preset_only = true;
+    return *this;
+}
+
 bool common_arg::in_example(enum llama_example ex) {
     return examples.find(ex) != examples.end();
 }
@@ -772,6 +777,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
         }
         auto opt = *arg_to_options[arg];
         std::string val;
+        if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
+            // bool arg (need to reverse the meaning for negative args)
+            bool is_neg = std::find(opt.args_neg.begin(), opt.args_neg.end(), arg) != opt.args_neg.end();
+            val = is_neg ? "0" : "1";
+        }
         if (opt.value_hint != nullptr) {
             // arg with single value
             check_arg(i);
@@ -1139,7 +1149,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"--cache-ram", "-cram"}, "N",
+        {"-cram", "--cache-ram"}, "N",
        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
                      "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
         [](common_params & params, int value) {
@@ -1147,7 +1157,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"--kv-unified", "-kvu"},
+        {"-kvu", "--kv-unified"},
         "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
         [](common_params & params) {
             params.kv_unified = true;
@@ -1415,7 +1425,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_sparam());
     add_opt(common_arg(
-        {"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
+        {"--sampler-seq", "--sampling-seq"}, "SEQUENCE",
         string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
         [](common_params & params, const std::string & value) {
             params.sampling.samplers = common_sampler_types_from_chars(value);
@@ -2073,26 +2083,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ));
     add_opt(common_arg(
-        {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
+        {"-ot", "--override-tensor"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type", [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
         }
     ));
     add_opt(common_arg(
-        {"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
+        {"-otd", "--override-tensor-draft"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"--cpu-moe", "-cmoe"},
+        {"-cmoe", "--cpu-moe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
             params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
-        {"--n-cpu-moe", "-ncmoe"}, "N",
+        {"-ncmoe", "--n-cpu-moe"}, "N",
         "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
         [](common_params & params, int value) {
             if (value < 0) {
@@ -2107,14 +2117,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_N_CPU_MOE"));
     add_opt(common_arg(
-        {"--cpu-moe-draft", "-cmoed"},
+        {"-cmoed", "--cpu-moe-draft"},
         "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
         [](common_params & params) {
             params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
     add_opt(common_arg(
-        {"--n-cpu-moe-draft", "-ncmoed"}, "N",
+        {"-ncmoed", "--n-cpu-moe-draft"}, "N",
         "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
         [](common_params & params, int value) {
             if (value < 0) {
@@ -2642,7 +2652,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
-        {"--reranking", "--rerank"},
+        {"--rerank", "--reranking"},
         string_format("enable reranking endpoint on server (default: %s)", "disabled"),
         [](common_params & params) {
             params.embedding = true;
@@ -2877,6 +2887,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.lora_init_without_apply = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--sleep-idle-seconds"}, "SECONDS",
+        string_format("number of seconds of idleness after which the server will sleep (default: %d; -1 = disabled)", params.sleep_idle_seconds),
+        [](common_params & params, int value) {
+            if (value == 0 || value < -1) {
+                throw std::invalid_argument("invalid value: cannot be 0 or less than -1");
+            }
+            params.sleep_idle_seconds = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--simple-io"},
         "use basic IO for better compatibility in subprocesses and limited consoles",
@@ -3113,7 +3133,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
-        {"--draft-max", "--draft", "--draft-n"}, "N",
+        {"--draft", "--draft-n", "--draft-max"}, "N",
         string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
         [](common_params & params, int value) {
             params.speculative.n_max = value;
@@ -3489,3 +3509,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 
     return ctx_arg;
 }
+
+void common_params_add_preset_options(std::vector<common_arg> & args) {
+    // arguments below won't be treated as CLI args, only preset options
+    args.push_back(common_arg(
+        {"load-on-startup"}, "NAME",
+        "in server router mode, autoload this model on startup",
+        [](common_params &, const std::string &) { /* unused */ }
+    ).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());
+
+    // args.push_back(common_arg(
+    //     {"pin"},
+    //     "in server router mode, do not unload this model if models_max is exceeded",
+    //     [](common_params &) { /* unused */ }
+    // ).set_preset_only());
+
+    // args.push_back(common_arg(
+    //     {"unload-idle-seconds"}, "SECONDS",
+    //     "in server router mode, unload models idle for more than this many seconds",
+    //     [](common_params &, int) { /* unused */ }
+    // ).set_preset_only());
+}
@@ -8,6 +8,9 @@
 #include <vector>
 #include <cstring>
 
+// pseudo-env variable to identify preset-only arguments
+#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
+
 //
 // CLI argument parsing
 //
@@ -22,6 +25,7 @@ struct common_arg {
     const char * env = nullptr;
     std::string help;
     bool is_sparam = false; // is current arg a sampling param?
+    bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
     void (*handler_void) (common_params & params) = nullptr;
     void (*handler_string) (common_params & params, const std::string &) = nullptr;
     void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
@@ -70,6 +74,7 @@ struct common_arg {
     common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
     common_arg & set_env(const char * env);
     common_arg & set_sparam();
+    common_arg & set_preset_only();
     bool in_example(enum llama_example ex);
     bool is_exclude(enum llama_example ex);
     bool get_value_from_env(std::string & output) const;
@@ -114,9 +119,13 @@ struct common_params_context {
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
 
 // parse input arguments from CLI into a map
-// TODO: support repeated args in the future
 bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
 
+// populate preset-only arguments
+// these arguments are not treated as command line arguments
+// see: https://github.com/ggml-org/llama.cpp/issues/18163
+void common_params_add_preset_options(std::vector<common_arg> & args);
+
 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
 
@@ -476,7 +476,8 @@ struct common_params {
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
-    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
+    bool prefill_assistant = true;  // if true, any trailing assistant message will be prefilled into the response
+    int sleep_idle_seconds = -1;    // if >0, server will sleep after this many seconds of idle time
 
     std::vector<std::string> api_keys;
 
@@ -2,6 +2,7 @@
 #include "preset.h"
 #include "peg-parser.h"
 #include "log.h"
+#include "download.h"
 
 #include <fstream>
 #include <sstream>
@@ -15,11 +16,22 @@ static std::string rm_leading_dashes(const std::string & str) {
     return str.substr(pos);
 }
 
-std::vector<std::string> common_preset::to_args() const {
+std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
     std::vector<std::string> args;
 
+    if (!bin_path.empty()) {
+        args.push_back(bin_path);
+    }
+
     for (const auto & [opt, value] : options) {
-        args.push_back(opt.args.back()); // use the last arg as the main arg
+        if (opt.is_preset_only) {
+            continue; // skip preset-only options (they are not CLI args)
+        }
+
+        // use the last arg as the main arg (i.e. --long-form)
+        args.push_back(opt.args.back());
+
+        // handle value(s)
         if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
             // flag option, no value
             if (common_arg_utils::is_falsey(value)) {
@@ -63,6 +75,52 @@ std::string common_preset::to_ini() const {
     return ss.str();
 }
 
+void common_preset::set_option(const common_preset_context & ctx, const std::string & env, const std::string & value) {
+    // try if option exists, update it
+    for (auto & [opt, val] : options) {
+        if (opt.env && env == opt.env) {
+            val = value;
+            return;
+        }
+    }
+    // if option does not exist, we need to add it
+    if (ctx.key_to_opt.find(env) == ctx.key_to_opt.end()) {
+        throw std::runtime_error(string_format(
+            "%s: option with env '%s' not found in ctx_params",
+            __func__, env.c_str()
+        ));
+    }
+    options[ctx.key_to_opt.at(env)] = value;
+}
+
+void common_preset::unset_option(const std::string & env) {
+    for (auto it = options.begin(); it != options.end(); ) {
+        const common_arg & opt = it->first;
+        if (opt.env && env == opt.env) {
+            it = options.erase(it);
+            return;
+        } else {
+            ++it;
+        }
+    }
+}
+
+bool common_preset::get_option(const std::string & env, std::string & value) const {
+    for (const auto & [opt, val] : options) {
+        if (opt.env && env == opt.env) {
+            value = val;
+            return true;
+        }
+    }
+    return false;
+}
+
+void common_preset::merge(const common_preset & other) {
+    for (const auto & [opt, val] : other.options) {
+        options[opt] = val; // overwrite existing options
+    }
+}
+
 static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
     std::map<std::string, std::map<std::string, std::string>> parsed;
 
@@ -172,9 +230,14 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
     return value;
 }
 
-common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
+common_preset_context::common_preset_context(llama_example ex)
+        : ctx_params(common_params_parser_init(default_params, ex)) {
+    common_params_add_preset_options(ctx_params.options);
+    key_to_opt = get_map_key_opt(ctx_params);
+}
+
+common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
     common_presets out;
-    auto key_to_opt = get_map_key_opt(ctx_params);
     auto ini_data = parse_ini_from_file(path);
 
     for (auto section : ini_data) {
@@ -188,7 +251,7 @@ common_presets common_presets_load(const std::string & path, common_params_conte
         for (const auto & [key, value] : section.second) {
            LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
             if (key_to_opt.find(key) != key_to_opt.end()) {
-                auto & opt = key_to_opt[key];
+                const auto & opt = key_to_opt.at(key);
                 if (is_bool_arg(opt)) {
                     preset.options[opt] = parse_bool_arg(opt, key, value);
                 } else {
@@ -199,8 +262,137 @@ common_presets common_presets_load(const std::string & path, common_params_conte
                 // TODO: maybe warn about unknown key?
             }
         }
+
+        if (preset.name == "*") {
+            // handle global preset
+            global = preset;
+        } else {
+            out[preset.name] = preset;
+        }
+    }
+
+    return out;
+}
+
+common_presets common_preset_context::load_from_cache() const {
+    common_presets out;
+
+    auto cached_models = common_list_cached_models();
+    for (const auto & model : cached_models) {
+        common_preset preset;
+        preset.name = model.to_string();
+        preset.set_option(*this, "LLAMA_ARG_HF_REPO", model.to_string());
         out[preset.name] = preset;
     }
 
     return out;
 }
+
+struct local_model {
+    std::string name;
+    std::string path;
+    std::string path_mmproj;
+};
+
+common_presets common_preset_context::load_from_models_dir(const std::string & models_dir) const {
+    if (!std::filesystem::exists(models_dir) || !std::filesystem::is_directory(models_dir)) {
+        throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", models_dir.c_str()));
+    }
+
+    std::vector<local_model> models;
+    auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) {
+        auto files = fs_list(subdir_path, false);
+        common_file_info model_file;
+        common_file_info first_shard_file;
+        common_file_info mmproj_file;
+        for (const auto & file : files) {
+            if (string_ends_with(file.name, ".gguf")) {
+                if (file.name.find("mmproj") != std::string::npos) {
+                    mmproj_file = file;
+                } else if (file.name.find("-00001-of-") != std::string::npos) {
+                    first_shard_file = file;
+                } else {
+                    model_file = file;
+                }
+            }
+        }
+        // single file model
+        local_model model{
+            /* name        */ name,
+            /* path        */ first_shard_file.path.empty() ? model_file.path : first_shard_file.path,
+            /* path_mmproj */ mmproj_file.path // can be empty
+        };
+        if (!model.path.empty()) {
+            models.push_back(model);
+        }
+    };
+
+    auto files = fs_list(models_dir, true);
+    for (const auto & file : files) {
+        if (file.is_dir) {
+            scan_subdir(file.path, file.name);
+        } else if (string_ends_with(file.name, ".gguf")) {
+            // single file model
+            std::string name = file.name;
+            string_replace_all(name, ".gguf", "");
+            local_model model{
+                /* name        */ name,
+                /* path        */ file.path,
+                /* path_mmproj */ ""
+            };
+            models.push_back(model);
+        }
+    }
+
+    // convert local models to presets
+    common_presets out;
+    for (const auto & model : models) {
+        common_preset preset;
+        preset.name = model.name;
+        preset.set_option(*this, "LLAMA_ARG_MODEL", model.path);
+        if (!model.path_mmproj.empty()) {
+            preset.set_option(*this, "LLAMA_ARG_MMPROJ", model.path_mmproj);
+        }
+        out[preset.name] = preset;
+    }
+
+    return out;
+}
+
+common_preset common_preset_context::load_from_args(int argc, char ** argv) const {
+    common_preset preset;
+    preset.name = COMMON_PRESET_DEFAULT_NAME;
+
+    bool ok = common_params_to_map(argc, argv, ctx_params.ex, preset.options);
+    if (!ok) {
+        throw std::runtime_error("failed to parse CLI arguments into preset");
+    }
+
+    return preset;
+}
+
+common_presets common_preset_context::cascade(const common_presets & base, const common_presets & added) const {
+    common_presets out = base; // copy
+    for (const auto & [name, preset_added] : added) {
+        if (out.find(name) != out.end()) {
+            // if exists, merge
+            common_preset & target = out[name];
+            target.merge(preset_added);
+        } else {
+            // otherwise, add directly
+            out[name] = preset_added;
+        }
+    }
+    return out;
+}
+
+common_presets common_preset_context::cascade(const common_preset & base, const common_presets & presets) const {
+    common_presets out;
+    for (const auto & [name, preset] : presets) {
+        common_preset tmp = base; // copy
+        tmp.name = name;
+        tmp.merge(preset);
+        out[name] = std::move(tmp);
+    }
+    return out;
+}
@@ -13,20 +13,62 @@
 
 constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default";
 
+struct common_preset_context;
+
 struct common_preset {
     std::string name;
-    // TODO: support repeated args in the future
+
+    // options are stored as common_arg to string mapping, representing CLI arg and its value
     std::map<common_arg, std::string> options;
 
     // convert preset to CLI argument list
-    std::vector<std::string> to_args() const;
+    std::vector<std::string> to_args(const std::string & bin_path = "") const;
 
     // convert preset to INI format string
     std::string to_ini() const;
 
     // TODO: maybe implement to_env() if needed
+
+    // modify preset options where argument is identified by its env variable
+    void set_option(const common_preset_context & ctx, const std::string & env, const std::string & value);
+
+    // unset option by its env variable
+    void unset_option(const std::string & env);
+
+    // get option value by its env variable, return false if not found
+    bool get_option(const std::string & env, std::string & value) const;
+
+    // merge another preset into this one, overwriting existing options
+    void merge(const common_preset & other);
 };
 
 // interface for multiple presets in one file
 using common_presets = std::map<std::string, common_preset>;
-common_presets common_presets_load(const std::string & path, common_params_context & ctx_params);
+
+// context for loading and editing presets
+struct common_preset_context {
+    common_params default_params; // unused for now
+    common_params_context ctx_params;
+    std::map<std::string, common_arg> key_to_opt;
+    common_preset_context(llama_example ex);
+
+    // load presets from INI file
+    common_presets load_from_ini(const std::string & path, common_preset & global) const;
+
+    // generate presets from cached models
+    common_presets load_from_cache() const;
+
+    // generate presets from local models directory
+    // for the directory structure, see "Using multiple models" in server/README.md
+    common_presets load_from_models_dir(const std::string & models_dir) const;
+
+    // generate one preset from CLI arguments
+    common_preset load_from_args(int argc, char ** argv) const;
+
+    // cascade multiple presets if exist on both: base < added
+    // if preset does not exist in base, it will be added without modification
+    common_presets cascade(const common_presets & base, const common_presets & added) const;
+
+    // apply presets over a base preset (same idea as CSS cascading)
+    common_presets cascade(const common_preset & base, const common_presets & presets) const;
+};
@@ -254,6 +254,7 @@ set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
                                             "gmml: OpenCL API version to target")
 
 option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
+set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml: quantize group size (32, 64, or 128)")
 
 # toolchain for vulkan-shaders-gen
 set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
@@ -1086,10 +1086,10 @@ bool llama_model_loader::load_all_data(
         } else {
             // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
             if (upload_backend) {
-                auto offset = (off_t) weight->offs;
+                size_t offset = weight->offs;
                 alignment = file->read_alignment();
-                off_t aligned_offset = offset & ~(alignment - 1);
-                off_t offset_from_alignment = offset - aligned_offset;
+                size_t aligned_offset = offset & ~(alignment - 1);
+                size_t offset_from_alignment = offset - aligned_offset;
                 file->seek(aligned_offset, SEEK_SET);
 
                 // Calculate aligned read boundaries