@fugood/llama.node 1.4.14 → 1.4.15

package/lib/binding.ts CHANGED
@@ -515,9 +515,20 @@ export interface LlamaContext {
  /**
  * Initialize multimodal support with a mmproj file
  * @param options Object containing path and optional use_gpu flag
+ * @param options.path Path to the multimodal projector model file (mmproj)
+ * @param options.use_gpu Whether to use GPU for multimodal processing (default: true)
+ * @param options.image_min_tokens Minimum number of tokens for image input (for dynamic resolution models)
+ * @param options.image_max_tokens Maximum number of tokens for image input (for dynamic resolution models).
+ * Lower values reduce memory usage and improve speed for high-resolution images.
+ * Recommended: 256-512 for faster inference, up to 4096 for maximum detail.
  * @returns boolean indicating if initialization was successful
  */
- initMultimodal(options: { path: string; use_gpu?: boolean }): boolean
+ initMultimodal(options: {
+   path: string
+   use_gpu?: boolean
+   image_min_tokens?: number
+   image_max_tokens?: number
+ }): boolean

  /**
  * Check if multimodal support is enabled
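The widened initMultimodal signature above can be exercised like this. This is an illustrative sketch only: it assumes the package's loadModel entry point, uses placeholder model/mmproj paths, and the token values simply follow the recommendations in the JSDoc above.

```ts
import { loadModel } from '@fugood/llama.node'

// Placeholder paths; the loadModel call shape is assumed, adjust to your setup.
const ctx = await loadModel({ model: './model.gguf' })

const ok = ctx.initMultimodal({
  path: './mmproj.gguf',
  use_gpu: true,
  // New in 1.4.15: bound the image token budget for dynamic-resolution models.
  image_min_tokens: 64,
  image_max_tokens: 512, // 256-512 favors speed; up to 4096 for maximum detail
})

if (!ok) {
  throw new Error('Failed to initialize multimodal support')
}
```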
package/lib/index.ts CHANGED
@@ -254,7 +254,12 @@ class LlamaContextWrapper {
  return this.ctx.getLoadedLoraAdapters()
  }

- initMultimodal(options: { path: string; use_gpu?: boolean }): boolean {
+ initMultimodal(options: {
+   path: string
+   use_gpu?: boolean
+   image_min_tokens?: number
+   image_max_tokens?: number
+ }): boolean {
  return this.ctx.initMultimodal(options)
  }

package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.4.14",
+ "version": "1.4.15",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -72,20 +72,20 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-darwin-arm64": "1.4.14",
- "@fugood/node-llama-darwin-x64": "1.4.14",
- "@fugood/node-llama-linux-arm64": "1.4.14",
- "@fugood/node-llama-linux-arm64-cuda": "1.4.14",
- "@fugood/node-llama-linux-arm64-snapdragon": "1.4.14",
- "@fugood/node-llama-linux-arm64-vulkan": "1.4.14",
- "@fugood/node-llama-linux-x64": "1.4.14",
- "@fugood/node-llama-linux-x64-cuda": "1.4.14",
- "@fugood/node-llama-linux-x64-vulkan": "1.4.14",
- "@fugood/node-llama-win32-arm64": "1.4.14",
- "@fugood/node-llama-win32-arm64-vulkan": "1.4.14",
- "@fugood/node-llama-win32-x64": "1.4.14",
- "@fugood/node-llama-win32-x64-cuda": "1.4.14",
- "@fugood/node-llama-win32-x64-vulkan": "1.4.14"
+ "@fugood/node-llama-darwin-arm64": "1.4.15",
+ "@fugood/node-llama-darwin-x64": "1.4.15",
+ "@fugood/node-llama-linux-arm64": "1.4.15",
+ "@fugood/node-llama-linux-arm64-cuda": "1.4.15",
+ "@fugood/node-llama-linux-arm64-snapdragon": "1.4.15",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.4.15",
+ "@fugood/node-llama-linux-x64": "1.4.15",
+ "@fugood/node-llama-linux-x64-cuda": "1.4.15",
+ "@fugood/node-llama-linux-x64-vulkan": "1.4.15",
+ "@fugood/node-llama-win32-arm64": "1.4.15",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.4.15",
+ "@fugood/node-llama-win32-x64": "1.4.15",
+ "@fugood/node-llama-win32-x64-cuda": "1.4.15",
+ "@fugood/node-llama-win32-x64-vulkan": "1.4.15"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
@@ -1,8 +1,8 @@
  diff --git a/src/llama.cpp/common/CMakeLists.txt b/src/llama.cpp/common/CMakeLists.txt
- index f7b99159e..fa37fed19 100644
+ index 723973ed7..e4b2c6537 100644
  --- a/src/llama.cpp/common/CMakeLists.txt
  +++ b/src/llama.cpp/common/CMakeLists.txt
- @@ -154,8 +154,14 @@ if (LLAMA_LLGUIDANCE)
+ @@ -146,4 +146,11 @@ if (LLAMA_LLGUIDANCE)
  set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
  endif ()

@@ -13,11 +13,8 @@ index f7b99159e..fa37fed19 100644
  +else()
  + set(LLAMA_COMMON_WIN_LIBS "")
  +endif()
-
+ +
  +target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${LLAMA_COMMON_WIN_LIBS} PUBLIC llama Threads::Threads)
-
- #
- # copy the license files
  diff --git a/src/llama.cpp/common/chat-peg-parser.cpp b/src/llama.cpp/common/chat-peg-parser.cpp
  index 1bcba9cd8..b7cd68734 100644
  --- a/src/llama.cpp/common/chat-peg-parser.cpp
@@ -32,7 +29,7 @@ index 1bcba9cd8..b7cd68734 100644
  static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
  int count = 0;
  diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
- index 22e527bab..c3d0affca 100644
+ index d531388bc..e6712b368 100644
  --- a/src/llama.cpp/common/chat.cpp
  +++ b/src/llama.cpp/common/chat.cpp
  @@ -7,9 +7,6 @@
@@ -62,7 +59,7 @@ index 22e527bab..c3d0affca 100644
  struct templates_params {
  json messages;
  json tools;
- @@ -752,7 +739,7 @@ static std::string apply(
+ @@ -753,7 +740,7 @@ static std::string apply(
  tmpl_inputs.extra_context.merge_patch(*additional_context);
  }
  // TODO: add flag to control date/time, if only for testing purposes.
@@ -72,7 +69,7 @@ index 22e527bab..c3d0affca 100644
  minja::chat_template_options tmpl_opts;
  // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
  diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
- index 8bd4a325f..333b3301f 100644
+ index 454085e90..e01390cf9 100644
  --- a/src/llama.cpp/common/chat.h
  +++ b/src/llama.cpp/common/chat.h
  @@ -10,7 +10,18 @@
@@ -108,10 +105,10 @@ index 744f0b4ee..04fcebb9e 100644
  mparams.main_gpu = params.main_gpu;
  mparams.split_mode = params.split_mode;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index 7794c0268..5b77ae0c3 100644
+ index e60087dea..c21797cd8 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
- @@ -310,6 +310,7 @@ struct lr_opt {
+ @@ -311,6 +311,7 @@ struct lr_opt {
  struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

  struct common_params {
@@ -1333,7 +1333,7 @@ extern "C" void cleanup_logging() {
  }


- // initMultimodal(options: { path: string, use_gpu?: boolean }): boolean
+ // initMultimodal(options: { path: string, use_gpu?: boolean, image_min_tokens?: number, image_max_tokens?: number }): boolean
  Napi::Value LlamaContext::InitMultimodal(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

@@ -1345,6 +1345,15 @@ Napi::Value LlamaContext::InitMultimodal(const Napi::CallbackInfo &info) {
  auto options = info[0].As<Napi::Object>();
  auto mmproj_path = options.Get("path").ToString().Utf8Value();
  auto use_gpu = options.Get("use_gpu").ToBoolean().Value();
+ int image_min_tokens = -1;
+ int image_max_tokens = -1;
+
+ if (options.Has("image_min_tokens") && options.Get("image_min_tokens").IsNumber()) {
+ image_min_tokens = options.Get("image_min_tokens").ToNumber().Int32Value();
+ }
+ if (options.Has("image_max_tokens") && options.Get("image_max_tokens").IsNumber()) {
+ image_max_tokens = options.Get("image_max_tokens").ToNumber().Int32Value();
+ }

  if (mmproj_path.empty()) {
  Napi::TypeError::New(env, "mmproj path is required")
@@ -1360,7 +1369,7 @@ Napi::Value LlamaContext::InitMultimodal(const Napi::CallbackInfo &info) {

  // Disable ctx_shift before initializing multimodal
  _rn_ctx->params.ctx_shift = false;
- bool result = _rn_ctx->initMultimodal(mmproj_path, use_gpu);
+ bool result = _rn_ctx->initMultimodal(mmproj_path, use_gpu, image_min_tokens, image_max_tokens);
  if (!result) {
  Napi::Error::New(env, "Failed to initialize multimodal context")
  .ThrowAsJavaScriptException();
@@ -111,11 +111,16 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
  option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})

  # 3rd party libs
- option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
- option(LLAMA_HTTPLIB "llama: if libcurl is disabled, use httplib to download model from an URL" ON)
- option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" OFF)
+ option(LLAMA_HTTPLIB "llama: httplib for downloading functionality" ON)
+ option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
  option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)

+ # deprecated
+ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
+ if (LLAMA_CURL)
+ message(WARNING "LLAMA_CURL option is deprecated and will be ignored")
+ endif()
+
  # Required for relocatable CMake package
  include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
  include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
@@ -182,6 +187,9 @@ if (NOT MSVC)
  endif()
  endif()

+ include("cmake/license.cmake")
+ license_add_file("llama.cpp" "LICENSE")
+
  #
  # 3rd-party
  #
@@ -209,11 +217,6 @@ add_subdirectory(src)
  # utils, programs, examples and tests
  #

- if (NOT LLAMA_BUILD_COMMON)
- message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL")
- set(LLAMA_CURL OFF)
- endif()
-
  if (LLAMA_BUILD_COMMON)
  add_subdirectory(common)
  if (LLAMA_HTTPLIB)
@@ -235,6 +238,19 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
  add_subdirectory(tools)
  endif()

+ # Automatically add all files from the 'licenses' directory
+ file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
+
+ foreach(FILE_PATH ${EXTRA_LICENSES})
+ get_filename_component(FILE_NAME "${FILE_PATH}" NAME)
+ string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}")
+ license_add_file("${NAME}" "${FILE_PATH}")
+ endforeach()
+
+ if (LLAMA_BUILD_COMMON)
+ license_generate(common)
+ endif()
+
  #
  # install
  #
@@ -60,6 +60,8 @@ add_library(${TARGET} STATIC
  common.h
  console.cpp
  console.h
+ debug.cpp
+ debug.h
  download.cpp
  download.h
  http.h
@@ -95,17 +97,7 @@ endif()
  # TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
  set(LLAMA_COMMON_EXTRA_LIBS build_info)

- if (LLAMA_CURL)
- # Use curl to download model url
- find_package(CURL)
- if (NOT CURL_FOUND)
- message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
- endif()
- target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
- include_directories(${CURL_INCLUDE_DIRS})
- set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
- elseif (LLAMA_HTTPLIB)
- # otherwise, use cpp-httplib
+ if (LLAMA_HTTPLIB)
  target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB)
  set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
  endif()
@@ -162,26 +154,3 @@ else()
  endif()

  target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${LLAMA_COMMON_WIN_LIBS} PUBLIC llama Threads::Threads)
-
- #
- # copy the license files
- #
-
- # Check if running in GitHub Actions
- if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
- message(STATUS "Running inside GitHub Actions - copying license files")
-
- # Copy all files from licenses/ to build/bin/
- file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
- foreach(LICENSE_FILE ${LICENSE_FILES})
- get_filename_component(FILENAME ${LICENSE_FILE} NAME)
- add_custom_command(
- POST_BUILD
- TARGET ${TARGET}
- COMMAND ${CMAKE_COMMAND} -E copy_if_different
- "${LICENSE_FILE}"
- "$<TARGET_FILE_DIR:llama>/${FILENAME}"
- COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
- message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
- endforeach()
- endif()
@@ -2,10 +2,10 @@

  #include "chat.h"
  #include "common.h"
+ #include "download.h"
  #include "json-schema-to-grammar.h"
  #include "log.h"
  #include "sampling.h"
- #include "download.h"
  #include "preset.h"

  // fix problem with std::min and std::max
@@ -48,6 +48,8 @@

  #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

+ extern const char * LICENSES[];
+
  using json = nlohmann::ordered_json;
  using namespace common_arg_utils;

@@ -279,12 +281,20 @@ static std::string clean_file_name(const std::string & fname) {
  static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
  GGML_ASSERT(!params.model.hf_repo.empty());

+ // the returned hf_repo is without tag
+ auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
+
+ // "latest" tag (default if not specified) is translated to "default" preset
+ if (hf_tag == "latest") {
+ hf_tag = "default";
+ }
+
  const bool offline = params.offline;
  std::string model_endpoint = get_model_endpoint();
- auto preset_url = model_endpoint + params.model.hf_repo + "/resolve/main/preset.ini";
+ auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";

  // prepare local path for caching
- auto preset_fname = clean_file_name(params.model.hf_repo + "_preset.ini");
+ auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
  auto preset_path = fs_get_cache_file(preset_fname);
  const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
  const bool has_preset = status >= 200 && status < 400;
@@ -293,14 +303,15 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
  if (has_preset) {
  LOG_INF("applying remote preset from %s\n", preset_url.c_str());
  common_preset_context ctx(ex, /* only_remote_allowed */ true);
- common_preset global; // unused for now
+ common_preset global;
  auto remote_presets = ctx.load_from_ini(preset_path, global);
- if (remote_presets.find(COMMON_PRESET_DEFAULT_NAME) != remote_presets.end()) {
- common_preset & preset = remote_presets.at(COMMON_PRESET_DEFAULT_NAME);
+ remote_presets = ctx.cascade(global, remote_presets);
+ if (remote_presets.find(hf_tag) != remote_presets.end()) {
+ common_preset preset = remote_presets.at(hf_tag);
  LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
  preset.apply_to_params(params);
  } else {
- throw std::runtime_error("Remote preset.ini does not contain [" + std::string(COMMON_PRESET_DEFAULT_NAME) + "] section");
+ throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
  }
  } else {
  LOG_INF("%s", "no remote preset found, skipping\n");
@@ -330,7 +341,7 @@ static handle_model_result common_params_handle_model(
  if (model.path.empty()) {
  auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
  if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
- exit(1); // built without CURL, error message already printed
+ exit(1); // error message already printed
  }
  model.name = model.hf_repo; // repo name with tag
  model.hf_repo = auto_detected.repo; // repo name without tag
@@ -1030,6 +1041,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  exit(0);
  }
  ));
+ add_opt(common_arg(
+ {"--license"},
+ "show source code license and dependencies",
+ [](common_params &) {
+ for (int i = 0; LICENSES[i]; ++i) {
+ printf("%s\n", LICENSES[i]);
+ }
+ exit(0);
+ }
+ ));
  add_opt(common_arg(
  {"-cl", "--cache-list"},
  "show list of models in cache",
@@ -1274,7 +1295,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params) {
  params.kv_unified = true;
  }
- ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY}));
+ ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED}));
  add_opt(common_arg(
  {"--context-shift"},
  {"--no-context-shift"},
@@ -2856,10 +2877,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.n_threads_http = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
+ add_opt(common_arg(
+ {"--cache-prompt"},
+ {"--no-cache-prompt"},
+ string_format("whether to enable prompt caching (default: %s)", params.cache_prompt ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.cache_prompt = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_PROMPT"));
  add_opt(common_arg(
  {"--cache-reuse"}, "N",
  string_format(
- "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
+ "min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: %d)\n"
  "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
  ),
  [](common_params & params, int value) {
@@ -1403,6 +1403,118 @@ static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
  builder.add_content(builder.consume_rest());
  }

+ static void common_chat_parse_exaone_moe_content(common_chat_msg_parser & builder) {
+ // 1) <tool_call>{ "name": "...", "arguments": {...} }</tool_call>
+ // 2) <tool_call>{ "id": "...", "type": "function", "function": { "name": "...", "arguments": {...} } }</tool_call>
+ static const common_regex tool_call_open(R"(<tool_call[^>]*>)");
+
+ if (!builder.syntax().parse_tool_calls) {
+ LOG_DBG("%s: not parse_tool_calls\n", __func__);
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+
+ LOG_DBG("%s: parse_tool_calls\n", __func__);
+
+ // Find all <tool_call></tool_call> blocks
+ while (auto first = builder.try_find_regex(tool_call_open, std::string::npos, /* add_prelude_to_content= */ true)) {
+ builder.move_to(first->groups[0].end);
+ builder.consume_spaces();
+
+ builder.try_consume_literal("```json");
+ builder.try_consume_literal("```");
+ builder.consume_spaces();
+
+ // Consume JSON object
+ auto data = builder.consume_json();
+
+ builder.consume_spaces();
+ builder.try_consume_literal("```");
+ builder.consume_spaces();
+
+ if (!builder.try_consume_literal("</tool_call>")) {
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ }
+ builder.consume_spaces();
+
+ // Extract name and arguments
+ std::string name;
+ std::string id;
+ nlohmann::ordered_json arguments;
+
+ const auto extract_args = [&](const nlohmann::ordered_json & obj) -> bool {
+ if (!obj.contains("name") || !obj.contains("arguments")) {
+ return false;
+ }
+ name = obj.at("name").get<std::string>();
+ arguments = obj.at("arguments");
+ if (obj.contains("id") && obj.at("id").is_string()) {
+ id = obj.at("id").get<std::string>();
+ }
+ return true;
+ };
+
+ if (!extract_args(data.json)) {
+ if (data.json.contains("function") && data.json.at("function").is_object()) {
+ auto fn = data.json.at("function");
+ extract_args(fn);
+ if (id.empty() && data.json.contains("id") && data.json.at("id").is_string()) {
+ id = data.json.at("id").get<std::string>();
+ }
+ }
+ }
+
+ // If name is empty, treat the JSON object as content
+ if (name.empty()) {
+ LOG_DBG("%s: tool call missing name, treating as content\n", __func__);
+ builder.add_content(data.json.dump());
+ continue;
+ }
+
+ std::string args_str = arguments.dump();
+ if (!builder.add_tool_call(name, id, args_str)) {
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ }
+ }
+
+ builder.add_content(builder.consume_rest());
+ }
+
+ static void common_chat_parse_exaone_moe(common_chat_msg_parser & builder) {
+ LOG_DBG("%s: parsing exaone_moe\n", __func__);
+ // EXAONE MoE outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
+ // First try to parse using the standard reasoning parsing method
+ LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
+
+ auto start_pos = builder.pos();
+ auto found_end_think = builder.try_find_literal("</think>");
+ builder.move_to(start_pos);
+
+ if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
+ LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
+ common_chat_parse_exaone_moe_content(builder);
+ } else if (builder.try_parse_reasoning("<think>", "</think>")) {
+ // If reasoning was parsed successfully, the remaining content is regular content
+ LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
+ common_chat_parse_exaone_moe_content(builder);
+ } else {
+ if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+ LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
+ common_chat_parse_exaone_moe_content(builder);
+ return;
+ }
+ // If no reasoning tags found, check if we should treat everything as reasoning
+ if (builder.syntax().thinking_forced_open) {
+ // If thinking is forced open but no tags found, treat everything as reasoning
+ LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
+ builder.add_reasoning_content(builder.consume_rest());
+ } else {
+ LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
+ common_chat_parse_exaone_moe_content(builder);
+ }
+ }
+ }
+
  static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
  builder.try_parse_reasoning("<think>", "</think>");
  builder.add_content(builder.consume_rest());
@@ -1490,6 +1602,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
  case COMMON_CHAT_FORMAT_SOLAR_OPEN:
  common_chat_parse_solar_open(builder);
  break;
+ case COMMON_CHAT_FORMAT_EXAONE_MOE:
+ common_chat_parse_exaone_moe(builder);
+ break;
  default:
  throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
  }
@@ -657,6 +657,7 @@ const char * common_chat_format_name(common_chat_format format) {
  case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
  case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
  case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
+ case COMMON_CHAT_FORMAT_EXAONE_MOE: return "EXAONE MoE";
  case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
  case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
  case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";
@@ -2526,6 +2527,65 @@ static common_chat_params common_chat_params_init_solar_open(const common_chat_t
  return data;
  }

+ static common_chat_params common_chat_params_init_exaone_moe(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ data.prompt = apply(tmpl, inputs);
+ data.format = COMMON_CHAT_FORMAT_EXAONE_MOE;
+ if (string_ends_with(data.prompt, "<think>\n")) {
+ if (!inputs.enable_thinking) {
+ data.prompt += "</think>\n\n";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ }
+
+ if (inputs.tools.is_array() && !inputs.tools.empty()) {
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ std::vector<std::string> tool_rules;
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+ // Expect: <tool_call>{"name": "<name>", "arguments": {...}}</tool_call>
+ tool_rules.push_back(builder.add_rule(
+ name + "-call",
+ "\"<tool_call>\" space " +
+ builder.add_schema(name + "-obj", json{
+ {"type", "object"},
+ {"properties", {
+ {"name", json{{"const", name}}},
+ {"arguments", parameters},
+ }},
+ {"required", json::array({"name", "arguments"})},
+ }) +
+ " space \"</tool_call>\" space"));
+ });
+
+ auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
+ builder.add_rule("root",
+ std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+ (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
+
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+ std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)?" : "") +
+ "(<tool_call>)[\\s\\S]*"
+ });
+ data.preserved_tokens = {
+ "<think>",
+ "</think>",
+ "<tool_call>",
+ "</tool_call>",
+ };
+ });
+ }
+
+ return data;
+ }
+
  static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
  common_chat_params data;
  data.prompt = apply(tmpl, inputs);
@@ -2696,6 +2756,13 @@ static common_chat_params common_chat_templates_apply_jinja(
  return common_chat_params_init_xiaomi_mimo(tmpl, params);
  }

+ // EXAONE MoE format detection
+ if (src.find("<tool_call>") != std::string::npos &&
+ src.find("<tool_result>") != std::string::npos &&
+ src.find("<|tool_declare|>") != std::string::npos) {
+ return common_chat_params_init_exaone_moe(tmpl, params);
+ }
+
  // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
  if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
  return common_chat_params_init_hermes_2_pro(tmpl, params);
@@ -136,6 +136,7 @@ enum common_chat_format {
  COMMON_CHAT_FORMAT_APRIEL_1_5,
  COMMON_CHAT_FORMAT_XIAOMI_MIMO,
  COMMON_CHAT_FORMAT_SOLAR_OPEN,
+ COMMON_CHAT_FORMAT_EXAONE_MOE,

  // These are intended to be parsed by the PEG parser
  COMMON_CHAT_FORMAT_PEG_SIMPLE,
@@ -80,6 +80,7 @@ int32_t cpu_get_num_math();
  //

  enum llama_example {
+ LLAMA_EXAMPLE_BATCHED,
  LLAMA_EXAMPLE_DEBUG,
  LLAMA_EXAMPLE_COMMON,
  LLAMA_EXAMPLE_SPECULATIVE,
@@ -476,6 +477,7 @@ struct common_params {
  int32_t timeout_write = timeout_read; // http write timeout in seconds
  int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
  int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
+ bool cache_prompt = true; // whether to enable prompt caching
  int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
  int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.