@fugood/llama.node 1.4.2 → 1.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/CMakeLists.txt +1 -1
  2. package/lib/binding.js +3 -0
  3. package/lib/binding.ts +10 -0
  4. package/lib/index.js +9 -0
  5. package/lib/index.ts +10 -0
  6. package/package.json +15 -15
  7. package/scripts/llama.cpp.patch +25 -11
  8. package/src/LlamaContext.cpp +24 -0
  9. package/src/LlamaContext.h +3 -0
  10. package/src/llama.cpp/CMakeLists.txt +21 -6
  11. package/src/llama.cpp/common/CMakeLists.txt +6 -0
  12. package/src/llama.cpp/common/arg.cpp +83 -22
  13. package/src/llama.cpp/common/chat-parser.cpp +40 -0
  14. package/src/llama.cpp/common/chat-peg-parser.cpp +110 -0
  15. package/src/llama.cpp/common/chat-peg-parser.h +105 -0
  16. package/src/llama.cpp/common/chat.cpp +40 -29
  17. package/src/llama.cpp/common/chat.h +10 -1
  18. package/src/llama.cpp/common/common.cpp +70 -7
  19. package/src/llama.cpp/common/common.h +23 -5
  20. package/src/llama.cpp/common/download.cpp +18 -8
  21. package/src/llama.cpp/common/download.h +3 -1
  22. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  23. package/src/llama.cpp/common/log.cpp +18 -27
  24. package/src/llama.cpp/common/log.h +19 -12
  25. package/src/llama.cpp/common/peg-parser.cpp +1712 -0
  26. package/src/llama.cpp/common/peg-parser.h +459 -0
  27. package/src/llama.cpp/common/unicode.cpp +64 -0
  28. package/src/llama.cpp/common/unicode.h +22 -0
  29. package/src/llama.cpp/ggml/CMakeLists.txt +52 -48
  30. package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -2
  31. package/src/llama.cpp/ggml/include/ggml-zendnn.h +22 -0
  32. package/src/llama.cpp/ggml/include/ggml.h +29 -2
  33. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -4
  34. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +0 -2
  36. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -13
  37. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +51 -125
  39. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
  40. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +98 -12
  41. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  42. package/src/llama.cpp/src/llama-arch.cpp +30 -1
  43. package/src/llama.cpp/src/llama-arch.h +3 -0
  44. package/src/llama.cpp/src/llama-graph.cpp +3 -6
  45. package/src/llama.cpp/src/llama-hparams.h +2 -2
  46. package/src/llama.cpp/src/llama-impl.h +1 -1
  47. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  48. package/src/llama.cpp/src/llama-model.cpp +54 -6
  49. package/src/llama.cpp/src/llama-quant.cpp +0 -29
  50. package/src/llama.cpp/src/llama-vocab.cpp +1 -2
  51. package/src/llama.cpp/src/models/deepseek2.cpp +18 -0
  52. package/src/llama.cpp/src/models/mistral3.cpp +160 -0
  53. package/src/llama.cpp/src/models/models.h +4 -0
  54. package/src/llama.cpp/src/unicode.cpp +2 -2
package/CMakeLists.txt CHANGED
@@ -99,7 +99,7 @@ endif()
  set(CMAKE_POSITION_INDEPENDENT_CODE ON)

  if (MINGW)
- add_definitions(-D_WIN32_WINNT=0x0601)
+ add_definitions(-D_WIN32_WINNT=0x0A00)
  endif()

  # VULKAN_SDK
package/lib/binding.js CHANGED
@@ -64,6 +64,9 @@ const loadModule = (variant) => __awaiter(void 0, void 0, void 0, function* () {
  /* no-op */
  }
  }
+ const nDev = process.env.GGML_HEXAGON_NDEV;
+ if (!nDev)
+ process.env.GGML_HEXAGON_NDEV = '16';
  }
  let module = yield loadPlatformPackage(packageName);
  if (module) {
package/lib/binding.ts CHANGED
@@ -565,6 +565,14 @@ export interface LlamaContext {
  */
  cancelRequest(requestId: number): void

+ /**
+ * Clear the KV and recurrent caches.
+ * This is faster than recreating the context and useful for preventing
+ * cache contamination between chat sessions.
+ * @param clearData If true, also clears the cache data (default: false)
+ */
+ clearCache(clearData?: boolean): void
+
  // static
  loadModelInfo(path: string, skip: string[]): Promise<GGUFModelInfo>
  toggleNativeLog(
@@ -616,6 +624,8 @@ export const loadModule = async (variant?: LibVariant): Promise<Module> => {
  /* no-op */
  }
  }
+ const nDev = process.env.GGML_HEXAGON_NDEV
+ if (!nDev) process.env.GGML_HEXAGON_NDEV = '16'
  }

  let module = await loadPlatformPackage(packageName)
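
Both lib/binding.js and lib/binding.ts now default GGML_HEXAGON_NDEV to '16' when the variable is unset, immediately before the platform package is loaded. Because the default only applies when the variable is missing, a caller can still choose its own value first. A minimal sketch in TypeScript (the import path and the variant string are illustrative assumptions, not taken from this diff):

  import { loadModule } from '@fugood/llama.node/lib/binding' // illustrative import path

  // An explicit value set before loading wins over the package default of '16'.
  process.env.GGML_HEXAGON_NDEV = '4'

  // 'snapdragon' is a placeholder; check LibVariant for the real variant identifiers.
  const mod = await loadModule('snapdragon')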
package/lib/index.js CHANGED
@@ -195,6 +195,15 @@ class LlamaContextWrapper {
  decodeAudioTokens(tokens) {
  return this.ctx.decodeAudioTokens(tokens);
  }
+ /**
+ * Clear the KV and recurrent caches.
+ * This is faster than recreating the context and useful for preventing
+ * cache contamination between chat sessions.
+ * @param clearData If true, also clears the cache data (default: false)
+ */
+ clearCache(clearData) {
+ this.ctx.clearCache(clearData);
+ }
  }
  const loadModel = (options, onProgress) => __awaiter(void 0, void 0, void 0, function* () {
  var _a, _b;
package/lib/index.ts CHANGED
@@ -299,6 +299,16 @@ class LlamaContextWrapper {
  decodeAudioTokens(tokens: number[] | Int32Array): Promise<Float32Array> {
  return this.ctx.decodeAudioTokens(tokens)
  }
+
+ /**
+ * Clear the KV and recurrent caches.
+ * This is faster than recreating the context and useful for preventing
+ * cache contamination between chat sessions.
+ * @param clearData If true, also clears the cache data (default: false)
+ */
+ clearCache(clearData?: boolean): void {
+ this.ctx.clearCache(clearData)
+ }
  }

  export const loadModel = async (
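
The clearCache addition spans lib/binding.ts, lib/index.js, lib/index.ts, and the native LlamaContext, letting a context drop its KV and recurrent cache state between chat sessions without being recreated; the native side rejects the call while a completion is still predicting. A minimal usage sketch in TypeScript (the loadModel option shape and model path are illustrative assumptions):

  import { loadModel } from '@fugood/llama.node'

  const context = await loadModel({ model: '/path/to/model.gguf' }) // option shape assumed

  // ... run and finish a completion for the first session ...

  context.clearCache()        // clear cache metadata only
  // context.clearCache(true) // also clear the cache data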
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.4.2",
+ "version": "1.4.4",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -72,20 +72,20 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-darwin-arm64": "1.4.2",
- "@fugood/node-llama-darwin-x64": "1.4.2",
- "@fugood/node-llama-linux-arm64": "1.4.2",
- "@fugood/node-llama-linux-arm64-cuda": "1.4.2",
- "@fugood/node-llama-linux-arm64-snapdragon": "1.4.2",
- "@fugood/node-llama-linux-arm64-vulkan": "1.4.2",
- "@fugood/node-llama-linux-x64": "1.4.2",
- "@fugood/node-llama-linux-x64-cuda": "1.4.2",
- "@fugood/node-llama-linux-x64-vulkan": "1.4.2",
- "@fugood/node-llama-win32-arm64": "1.4.2",
- "@fugood/node-llama-win32-arm64-vulkan": "1.4.2",
- "@fugood/node-llama-win32-x64": "1.4.2",
- "@fugood/node-llama-win32-x64-cuda": "1.4.2",
- "@fugood/node-llama-win32-x64-vulkan": "1.4.2"
+ "@fugood/node-llama-darwin-arm64": "1.4.4",
+ "@fugood/node-llama-darwin-x64": "1.4.4",
+ "@fugood/node-llama-linux-arm64": "1.4.4",
+ "@fugood/node-llama-linux-arm64-cuda": "1.4.4",
+ "@fugood/node-llama-linux-arm64-snapdragon": "1.4.4",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.4.4",
+ "@fugood/node-llama-linux-x64": "1.4.4",
+ "@fugood/node-llama-linux-x64-cuda": "1.4.4",
+ "@fugood/node-llama-linux-x64-vulkan": "1.4.4",
+ "@fugood/node-llama-win32-arm64": "1.4.4",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.4.4",
+ "@fugood/node-llama-win32-x64": "1.4.4",
+ "@fugood/node-llama-win32-x64-cuda": "1.4.4",
+ "@fugood/node-llama-win32-x64-vulkan": "1.4.4"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch CHANGED
@@ -1,8 +1,8 @@
  diff --git a/src/llama.cpp/common/CMakeLists.txt b/src/llama.cpp/common/CMakeLists.txt
- index bb168e835..cfc0e2c2e 100644
+ index 377b26846..1873b5206 100644
  --- a/src/llama.cpp/common/CMakeLists.txt
  +++ b/src/llama.cpp/common/CMakeLists.txt
- @@ -143,9 +143,16 @@ if (LLAMA_LLGUIDANCE)
+ @@ -149,9 +149,16 @@ if (LLAMA_LLGUIDANCE)
  set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
  endif ()

@@ -20,8 +20,22 @@ index bb168e835..cfc0e2c2e 100644


  #
+ diff --git a/src/llama.cpp/common/chat-peg-parser.cpp b/src/llama.cpp/common/chat-peg-parser.cpp
+ index 74a7b6a46..7b7a1bd50 100644
+ --- a/src/llama.cpp/common/chat-peg-parser.cpp
+ +++ b/src/llama.cpp/common/chat-peg-parser.cpp
+ @@ -1,9 +1,5 @@
+ #include "chat-peg-parser.h"
+
+ -#include <nlohmann/json.hpp>
+ -
+ -using json = nlohmann::json;
+ -
+ static std::string_view trim_trailing_space(std::string_view sv) {
+ while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
+ sv.remove_suffix(1);
  diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
- index b4a0f985e..2383d2ea9 100644
+ index 41a5bb42d..da5cf4b94 100644
  --- a/src/llama.cpp/common/chat.cpp
  +++ b/src/llama.cpp/common/chat.cpp
  @@ -6,9 +6,6 @@
@@ -34,7 +48,7 @@ index b4a0f985e..2383d2ea9 100644
  #include <algorithm>
  #include <cstdio>
  #include <cctype>
- @@ -126,16 +123,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
+ @@ -134,16 +131,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
  return diffs;
  }

@@ -51,7 +65,7 @@ index b4a0f985e..2383d2ea9 100644
  struct templates_params {
  json messages;
  json tools;
- @@ -709,7 +696,7 @@ static std::string apply(
+ @@ -720,7 +707,7 @@ static std::string apply(
  tmpl_inputs.extra_context.merge_patch(*additional_context);
  }
  // TODO: add flag to control date/time, if only for testing purposes.
@@ -61,10 +75,10 @@ index b4a0f985e..2383d2ea9 100644
  minja::chat_template_options tmpl_opts;
  // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
  diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
- index 754c411e2..71241a6cc 100644
+ index 6085510a4..263076ce2 100644
  --- a/src/llama.cpp/common/chat.h
  +++ b/src/llama.cpp/common/chat.h
- @@ -9,7 +9,18 @@
+ @@ -10,7 +10,18 @@
  #include <vector>
  #include <map>

@@ -85,10 +99,10 @@ index 754c411e2..71241a6cc 100644
  struct common_chat_tool_call {
  std::string name;
  diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
- index 0d7fd9a93..6bf3cc7ab 100644
+ index f07af1d86..1b10c7b13 100644
  --- a/src/llama.cpp/common/common.cpp
  +++ b/src/llama.cpp/common/common.cpp
- @@ -1217,6 +1217,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+ @@ -1236,6 +1236,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
  mparams.n_gpu_layers = params.n_gpu_layers;
  }

@@ -97,10 +111,10 @@ index 0d7fd9a93..6bf3cc7ab 100644
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index 2f23d0baa..e4e6c795e 100644
+ index 179113a4d..78aa24bc3 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
- @@ -299,6 +299,7 @@ struct lr_opt {
+ @@ -302,6 +302,7 @@ struct lr_opt {
  struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

  struct common_params {
package/src/LlamaContext.cpp CHANGED
@@ -200,6 +200,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
  static_cast<napi_property_attributes>(napi_enumerable)),
  InstanceMethod<&LlamaContext::CancelRequest>(
  "cancelRequest",
+ static_cast<napi_property_attributes>(napi_enumerable)),
+ InstanceMethod<&LlamaContext::ClearCache>(
+ "clearCache",
  static_cast<napi_property_attributes>(napi_enumerable))});
  Napi::FunctionReference *constructor = new Napi::FunctionReference();
  *constructor = Napi::Persistent(func);
@@ -1505,3 +1508,24 @@ Napi::Value LlamaContext::DecodeAudioTokens(const Napi::CallbackInfo &info) {
  worker->Queue();
  return worker->Promise();
  }
+
+ // clearCache(clearData?: boolean): void
+ void LlamaContext::ClearCache(const Napi::CallbackInfo &info) {
+ Napi::Env env = info.Env();
+ if (!_rn_ctx) {
+ Napi::TypeError::New(env, "Context is disposed").ThrowAsJavaScriptException();
+ return;
+ }
+ if (_rn_ctx->completion != nullptr && _rn_ctx->completion->is_predicting) {
+ Napi::TypeError::New(env, "Cannot clear cache while completion is in progress")
+ .ThrowAsJavaScriptException();
+ return;
+ }
+
+ bool clear_data = false;
+ if (info.Length() >= 1 && info[0].IsBoolean()) {
+ clear_data = info[0].ToBoolean().Value();
+ }
+
+ _rn_ctx->clearCache(clear_data);
+ }
package/src/LlamaContext.h CHANGED
@@ -69,6 +69,9 @@ private:
  Napi::Value QueueRerank(const Napi::CallbackInfo &info);
  void CancelRequest(const Napi::CallbackInfo &info);

+ // Cache management
+ void ClearCache(const Napi::CallbackInfo &info);
+
  std::string _info;
  std::vector<std::string> _used_devices;
  Napi::Object _meta;
package/src/llama.cpp/CMakeLists.txt CHANGED
@@ -33,10 +33,24 @@ endif()

  option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)

+ option(LLAMA_WASM_MEM64 "llama: use 64-bit memory in WASM builds" ON)
+
  if (EMSCRIPTEN)
  set(BUILD_SHARED_LIBS_DEFAULT OFF)

- option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
+ # Use 64-bit memory to support backend_get_memory queries
+ # TODO: analyze performance impact, see https://spidermonkey.dev/blog/2025/01/15/is-memory64-actually-worth-using
+ if (LLAMA_WASM_MEM64)
+ add_compile_options("-sMEMORY64=1")
+ add_link_options("-sMEMORY64=1")
+ endif()
+ add_link_options("-sALLOW_MEMORY_GROWTH=1")
+
+ option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" OFF)
+ option(LLAMA_BUILD_HTML "llama: build HTML file" ON)
+ if (LLAMA_BUILD_HTML)
+ set(CMAKE_EXECUTABLE_SUFFIX ".html")
+ endif()
  else()
  if (MINGW)
  set(BUILD_SHARED_LIBS_DEFAULT OFF)
@@ -58,6 +72,12 @@ if (MSVC)
  add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
  endif()

+ if (LLAMA_STANDALONE)
+ # enable parallel builds for msbuild
+ list(APPEND CMAKE_VS_GLOBALS UseMultiToolTask=true)
+ list(APPEND CMAKE_VS_GLOBALS EnforceProcessCountAcrossBuilds=true)
+ endif()
+
  if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
  set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
  else()
@@ -179,11 +199,6 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
  # ... otherwise assume ggml is added by a parent CMakeLists.txt
  endif()

- if (MINGW)
- # Target Windows 8 for PrefetchVirtualMemory
- add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
- endif()
-
  #
  # build the library
  #
package/src/llama.cpp/common/CMakeLists.txt CHANGED
@@ -52,6 +52,8 @@ add_library(${TARGET} STATIC
  chat-parser.h
  chat-parser-xml-toolcall.h
  chat-parser-xml-toolcall.cpp
+ chat-peg-parser.cpp
+ chat-peg-parser.h
  chat.cpp
  chat.h
  common.cpp
@@ -69,12 +71,16 @@ add_library(${TARGET} STATIC
  log.h
  ngram-cache.cpp
  ngram-cache.h
+ peg-parser.cpp
+ peg-parser.h
  regex-partial.cpp
  regex-partial.h
  sampling.cpp
  sampling.h
  speculative.cpp
  speculative.h
+ unicode.cpp
+ unicode.h
  )

  if (BUILD_SHARED_LIBS)
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -30,6 +30,7 @@
  #include <thread> // for hardware_concurrency
  #include <vector>

+ #ifndef __EMSCRIPTEN__
  #ifdef __linux__
  #include <linux/limits.h>
  #elif defined(_WIN32)
@@ -41,6 +42,8 @@
  #else
  #include <sys/syslimits.h>
  #endif
+ #endif
+
  #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

  using json = nlohmann::ordered_json;
@@ -212,13 +215,13 @@ struct handle_model_result {
  static handle_model_result common_params_handle_model(
  struct common_params_model & model,
  const std::string & bearer_token,
- const std::string & model_path_default,
  bool offline) {
  handle_model_result result;
  // handle pre-fill default model path and url based on hf_repo and hf_file
  {
  if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
  model.path = common_docker_resolve_model(model.docker_repo);
+ model.name = model.docker_repo; // set name for consistency
  } else if (!model.hf_repo.empty()) {
  // short-hand to avoid specifying --hf-file -> default it to --model
  if (model.hf_file.empty()) {
@@ -227,7 +230,8 @@ static handle_model_result common_params_handle_model(
  if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
  exit(1); // built without CURL, error message already printed
  }
- model.hf_repo = auto_detected.repo;
+ model.name = model.hf_repo; // repo name with tag
+ model.hf_repo = auto_detected.repo; // repo name without tag
  model.hf_file = auto_detected.ggufFile;
  if (!auto_detected.mmprojFile.empty()) {
  result.found_mmproj = true;
@@ -257,8 +261,6 @@ static handle_model_result common_params_handle_model(
  model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
  }

- } else if (model.path.empty()) {
- model.path = model_path_default;
  }
  }

@@ -405,7 +407,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context

  // handle model and download
  {
- auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
+ auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
  if (params.no_mmproj) {
  params.mmproj = {};
  } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@@ -415,12 +417,18 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
  // only download mmproj if the current example is using it
  for (auto & ex : mmproj_examples) {
  if (ctx_arg.ex == ex) {
- common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
+ common_params_handle_model(params.mmproj, params.hf_token, params.offline);
  break;
  }
  }
- common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
- common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
+ common_params_handle_model(params.speculative.model, params.hf_token, params.offline);
+ common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
+ }
+
+ // model is required (except for server)
+ // TODO @ngxson : maybe show a list of available models in CLI in this case
+ if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) {
+ throw std::invalid_argument("error: --model is required\n");
  }

  if (params.escape) {
@@ -700,6 +708,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.use_jinja = true;
  }

+ params.use_color = tty_can_use_colors();
+
  // load dynamic backends
  ggml_backend_load_all();

@@ -782,10 +792,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
  add_opt(common_arg(
- {"-co", "--color"},
- string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
- [](common_params & params) {
- params.use_color = true;
+ {"-co", "--color"}, "[on|off|auto]",
+ "Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
+ "'auto' enables colors when output is to a terminal",
+ [](common_params & params, const std::string & value) {
+ if (is_truthy(value)) {
+ params.use_color = true;
+ } else if (is_falsey(value)) {
+ params.use_color = false;
+ } else if (is_autoy(value)) {
+ params.use_color = tty_can_use_colors();
+ } else {
+ throw std::invalid_argument(
+ string_format("error: unknown value for --color: '%s'\n", value.c_str()));
+ }
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
  add_opt(common_arg(
@@ -1014,7 +1034,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
  } else {
  throw std::runtime_error(
- string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
+ string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
  }
  }).set_env("LLAMA_ARG_FLASH_ATTN"));
  add_opt(common_arg(
@@ -1221,7 +1241,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params) {
  params.warmup = false;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
  add_opt(common_arg(
  {"--spm-infill"},
  string_format(
@@ -2090,11 +2110,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  add_opt(common_arg(
  {"-m", "--model"}, "FNAME",
  ex == LLAMA_EXAMPLE_EXPORT_LORA
- ? std::string("model path from which to load base model")
- : string_format(
- "model path (default: `models/$filename` with filename from `--hf-file` "
- "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
- ),
+ ? "model path from which to load base model"
+ : "model path to load",
  [](common_params & params, const std::string & value) {
  params.model.path = value;
  }
@@ -2486,12 +2503,50 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  "path to save slot kv cache (default: disabled)",
  [](common_params & params, const std::string & value) {
  params.slot_save_path = value;
+ if (!fs_is_directory(params.slot_save_path)) {
+ throw std::invalid_argument("not a directory: " + value);
+ }
  // if doesn't end with DIRECTORY_SEPARATOR, add it
  if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
  params.slot_save_path += DIRECTORY_SEPARATOR;
  }
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--media-path"}, "PATH",
+ "directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled)",
+ [](common_params & params, const std::string & value) {
+ params.media_path = value;
+ if (!fs_is_directory(params.media_path)) {
+ throw std::invalid_argument("not a directory: " + value);
+ }
+ // if doesn't end with DIRECTORY_SEPARATOR, add it
+ if (!params.media_path.empty() && params.media_path[params.media_path.size() - 1] != DIRECTORY_SEPARATOR) {
+ params.media_path += DIRECTORY_SEPARATOR;
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--models-dir"}, "PATH",
+ "directory containing models for the router server (default: disabled)",
+ [](common_params & params, const std::string & value) {
+ params.models_dir = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
+ add_opt(common_arg(
+ {"--models-max"}, "N",
+ string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
+ [](common_params & params, int value) {
+ params.models_max = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
+ add_opt(common_arg(
+ {"--no-models-autoload"},
+ "disables automatic loading of models (default: enabled)",
+ [](common_params & params) {
+ params.models_autoload = false;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
  add_opt(common_arg(
  {"--jinja"},
  string_format("use jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
@@ -2639,7 +2694,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params &, const std::string & value) {
  common_log_set_file(common_log_main(), value.c_str());
  }
- ));
+ ).set_env("LLAMA_LOG_FILE"));
  add_opt(common_arg(
  {"--log-colors"}, "[on|off|auto]",
  "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
@@ -2653,7 +2708,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
  } else {
  throw std::invalid_argument(
- string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
+ string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
  }
  }
  ).set_env("LLAMA_LOG_COLORS"));
@@ -2674,7 +2729,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_env("LLAMA_OFFLINE"));
  add_opt(common_arg(
  {"-lv", "--verbosity", "--log-verbosity"}, "N",
- "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
+ string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
+ " - 0: generic output\n"
+ " - 1: error\n"
+ " - 2: warning\n"
+ " - 3: info\n"
+ " - 4: debug\n"
+ "(default: %d)\n", params.verbosity),
  [](common_params & params, int value) {
  params.verbosity = value;
  common_log_set_verbosity_thold(value);
package/src/llama.cpp/common/chat-parser.cpp CHANGED
@@ -1,6 +1,8 @@
  #include "chat-parser.h"
+ #include "chat-peg-parser.h"
  #include "common.h"
  #include "log.h"
+ #include "peg-parser.h"
  #include "regex-partial.h"

  #include <algorithm>
@@ -1483,6 +1485,11 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
  }

  common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
+ if (syntax.format == COMMON_CHAT_FORMAT_PEG_SIMPLE ||
+ syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE ||
+ syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
+ return common_chat_peg_parse(syntax.parser, input, is_partial, syntax);
+ }
  common_chat_msg_parser builder(input, is_partial, syntax);
  try {
  common_chat_parse(builder);
@@ -1500,3 +1507,36 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
  }
  return msg;
  }
+
+ common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
+ if (parser.empty()) {
+ throw std::runtime_error("Failed to parse due to missing parser definition.");
+ }
+
+ LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(syntax.format), input.c_str());
+
+ common_peg_parse_context ctx(input, is_partial);
+ auto result = parser.parse(ctx);
+ if (result.fail()) {
+ throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end));
+ }
+
+ common_chat_msg msg;
+ msg.role = "assistant";
+
+ if (syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE) {
+ auto mapper = common_chat_peg_native_mapper(msg);
+ mapper.from_ast(ctx.ast, result);
+ } else if (syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
+ auto mapper = common_chat_peg_constructed_mapper(msg);
+ mapper.from_ast(ctx.ast, result);
+ } else {
+ // Generic mapper
+ auto mapper = common_chat_peg_mapper(msg);
+ mapper.from_ast(ctx.ast, result);
+ }
+ if (!is_partial) {
+ LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+ }
+ return msg;
+ }