@fugood/llama.node 1.4.14 → 1.4.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +12 -1
- package/lib/index.ts +6 -1
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +8 -11
- package/src/LlamaContext.cpp +11 -2
- package/src/llama.cpp/CMakeLists.txt +24 -8
- package/src/llama.cpp/common/CMakeLists.txt +3 -34
- package/src/llama.cpp/common/arg.cpp +39 -10
- package/src/llama.cpp/common/chat-parser.cpp +115 -0
- package/src/llama.cpp/common/chat.cpp +67 -0
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.h +2 -0
- package/src/llama.cpp/common/debug.cpp +165 -0
- package/src/llama.cpp/common/debug.h +43 -0
- package/src/llama.cpp/common/download.cpp +12 -342
- package/src/llama.cpp/common/download.h +6 -0
- package/src/llama.cpp/common/preset.cpp +12 -2
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +35 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +20 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +31 -43
- package/src/llama.cpp/src/llama-mmap.cpp +8 -5
- package/src/llama.cpp/src/llama-model.cpp +210 -96
- package/src/llama.cpp/src/llama-vocab.cpp +37 -24
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
- package/src/llama.cpp/src/models/models.h +13 -2
- package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
package/lib/binding.ts
CHANGED

```diff
@@ -515,9 +515,20 @@ export interface LlamaContext {
   /**
    * Initialize multimodal support with a mmproj file
    * @param options Object containing path and optional use_gpu flag
+   * @param options.path Path to the multimodal projector model file (mmproj)
+   * @param options.use_gpu Whether to use GPU for multimodal processing (default: true)
+   * @param options.image_min_tokens Minimum number of tokens for image input (for dynamic resolution models)
+   * @param options.image_max_tokens Maximum number of tokens for image input (for dynamic resolution models).
+   *   Lower values reduce memory usage and improve speed for high-resolution images.
+   *   Recommended: 256-512 for faster inference, up to 4096 for maximum detail.
    * @returns boolean indicating if initialization was successful
    */
-  initMultimodal(options: {
+  initMultimodal(options: {
+    path: string
+    use_gpu?: boolean
+    image_min_tokens?: number
+    image_max_tokens?: number
+  }): boolean

   /**
    * Check if multimodal support is enabled
```
package/lib/index.ts
CHANGED

```diff
@@ -254,7 +254,12 @@ class LlamaContextWrapper {
     return this.ctx.getLoadedLoraAdapters()
   }

-  initMultimodal(options: {
+  initMultimodal(options: {
+    path: string
+    use_gpu?: boolean
+    image_min_tokens?: number
+    image_max_tokens?: number
+  }): boolean {
     return this.ctx.initMultimodal(options)
   }

```
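The new `image_min_tokens` / `image_max_tokens` options are optional and pass straight through the TypeScript wrapper to the native `InitMultimodal` binding (see the `LlamaContext.cpp` change below). A minimal usage sketch follows; the `loadModel` entry point, file paths, and the specific token values are illustrative assumptions, only the `initMultimodal` option shape comes from this release:

```ts
import { loadModel } from '@fugood/llama.node' // assumed context-creation entry point

async function main() {
  // Placeholder model/mmproj paths; any vision-capable GGUF pair would do.
  const context = await loadModel({ model: './models/model.gguf' })

  // New in 1.4.15: optional token bounds for dynamic-resolution image input.
  // A lower image_max_tokens trades image detail for speed and memory.
  const ok = context.initMultimodal({
    path: './models/mmproj.gguf',
    use_gpu: true,
    image_min_tokens: 64,  // illustrative value
    image_max_tokens: 512, // 256-512 recommended for faster inference
  })

  if (!ok) {
    throw new Error('Failed to initialize multimodal context')
  }
}

main().catch(console.error)
```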
package/package.json
CHANGED

```diff
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.14",
+  "version": "1.4.15",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.14",
-    "@fugood/node-llama-darwin-x64": "1.4.14",
-    "@fugood/node-llama-linux-arm64": "1.4.14",
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.14",
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.14",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.14",
-    "@fugood/node-llama-linux-x64": "1.4.14",
-    "@fugood/node-llama-linux-x64-cuda": "1.4.14",
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.14",
-    "@fugood/node-llama-win32-arm64": "1.4.14",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.14",
-    "@fugood/node-llama-win32-x64": "1.4.14",
-    "@fugood/node-llama-win32-x64-cuda": "1.4.14",
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.14"
+    "@fugood/node-llama-darwin-arm64": "1.4.15",
+    "@fugood/node-llama-darwin-x64": "1.4.15",
+    "@fugood/node-llama-linux-arm64": "1.4.15",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.15",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.15",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.15",
+    "@fugood/node-llama-linux-x64": "1.4.15",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.15",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.15",
+    "@fugood/node-llama-win32-arm64": "1.4.15",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.15",
+    "@fugood/node-llama-win32-x64": "1.4.15",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.15",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.15"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
```
package/scripts/llama.cpp.patch
CHANGED

```diff
@@ -1,8 +1,8 @@
 diff --git a/src/llama.cpp/common/CMakeLists.txt b/src/llama.cpp/common/CMakeLists.txt
-index 
+index 723973ed7..e4b2c6537 100644
 --- a/src/llama.cpp/common/CMakeLists.txt
 +++ b/src/llama.cpp/common/CMakeLists.txt
-@@ -
+@@ -146,4 +146,11 @@ if (LLAMA_LLGUIDANCE)
  set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
  endif ()

@@ -13,11 +13,8 @@ index f7b99159e..fa37fed19 100644
 +else()
 + set(LLAMA_COMMON_WIN_LIBS "")
 +endif()
-
++
 +target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${LLAMA_COMMON_WIN_LIBS} PUBLIC llama Threads::Threads)
-
-#
-# copy the license files
 diff --git a/src/llama.cpp/common/chat-peg-parser.cpp b/src/llama.cpp/common/chat-peg-parser.cpp
 index 1bcba9cd8..b7cd68734 100644
 --- a/src/llama.cpp/common/chat-peg-parser.cpp
@@ -32,7 +29,7 @@ index 1bcba9cd8..b7cd68734 100644
 static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
 int count = 0;
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index 
+index d531388bc..e6712b368 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -7,9 +7,6 @@
@@ -62,7 +59,7 @@ index 22e527bab..c3d0affca 100644
 struct templates_params {
 json messages;
 json tools;
-@@ -
+@@ -753,7 +740,7 @@ static std::string apply(
 tmpl_inputs.extra_context.merge_patch(*additional_context);
 }
 // TODO: add flag to control date/time, if only for testing purposes.
@@ -72,7 +69,7 @@ index 22e527bab..c3d0affca 100644
 minja::chat_template_options tmpl_opts;
 // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
 diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
-index 
+index 454085e90..e01390cf9 100644
 --- a/src/llama.cpp/common/chat.h
 +++ b/src/llama.cpp/common/chat.h
 @@ -10,7 +10,18 @@
@@ -108,10 +105,10 @@ index 744f0b4ee..04fcebb9e 100644
 mparams.main_gpu = params.main_gpu;
 mparams.split_mode = params.split_mode;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index 
+index e60087dea..c21797cd8 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
-@@ -
+@@ -311,6 +311,7 @@ struct lr_opt {
 struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

 struct common_params {
```
package/src/LlamaContext.cpp
CHANGED

```diff
@@ -1333,7 +1333,7 @@ extern "C" void cleanup_logging() {
 }


-// initMultimodal(options: { path: string, use_gpu?: boolean }): boolean
+// initMultimodal(options: { path: string, use_gpu?: boolean, image_min_tokens?: number, image_max_tokens?: number }): boolean
 Napi::Value LlamaContext::InitMultimodal(const Napi::CallbackInfo &info) {
   Napi::Env env = info.Env();

@@ -1345,6 +1345,15 @@ Napi::Value LlamaContext::InitMultimodal(const Napi::CallbackInfo &info) {
   auto options = info[0].As<Napi::Object>();
   auto mmproj_path = options.Get("path").ToString().Utf8Value();
   auto use_gpu = options.Get("use_gpu").ToBoolean().Value();
+  int image_min_tokens = -1;
+  int image_max_tokens = -1;
+
+  if (options.Has("image_min_tokens") && options.Get("image_min_tokens").IsNumber()) {
+    image_min_tokens = options.Get("image_min_tokens").ToNumber().Int32Value();
+  }
+  if (options.Has("image_max_tokens") && options.Get("image_max_tokens").IsNumber()) {
+    image_max_tokens = options.Get("image_max_tokens").ToNumber().Int32Value();
+  }

   if (mmproj_path.empty()) {
     Napi::TypeError::New(env, "mmproj path is required")
@@ -1360,7 +1369,7 @@ Napi::Value LlamaContext::InitMultimodal(const Napi::CallbackInfo &info) {

   // Disable ctx_shift before initializing multimodal
   _rn_ctx->params.ctx_shift = false;
-  bool result = _rn_ctx->initMultimodal(mmproj_path, use_gpu);
+  bool result = _rn_ctx->initMultimodal(mmproj_path, use_gpu, image_min_tokens, image_max_tokens);
   if (!result) {
     Napi::Error::New(env, "Failed to initialize multimodal context")
       .ThrowAsJavaScriptException();
```
package/src/llama.cpp/CMakeLists.txt
CHANGED

```diff
@@ -111,11 +111,16 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
 option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})

 # 3rd party libs
-option(
-option(
-option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" OFF)
+option(LLAMA_HTTPLIB "llama: httplib for downloading functionality" ON)
+option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)

+# deprecated
+option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
+if (LLAMA_CURL)
+    message(WARNING "LLAMA_CURL option is deprecated and will be ignored")
+endif()
+
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
@@ -182,6 +187,9 @@ if (NOT MSVC)
     endif()
 endif()

+include("cmake/license.cmake")
+license_add_file("llama.cpp" "LICENSE")
+
 #
 # 3rd-party
 #
@@ -209,11 +217,6 @@ add_subdirectory(src)
 # utils, programs, examples and tests
 #

-if (NOT LLAMA_BUILD_COMMON)
-    message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL")
-    set(LLAMA_CURL OFF)
-endif()
-
 if (LLAMA_BUILD_COMMON)
     add_subdirectory(common)
     if (LLAMA_HTTPLIB)
@@ -235,6 +238,19 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
     add_subdirectory(tools)
 endif()

+# Automatically add all files from the 'licenses' directory
+file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
+
+foreach(FILE_PATH ${EXTRA_LICENSES})
+    get_filename_component(FILE_NAME "${FILE_PATH}" NAME)
+    string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}")
+    license_add_file("${NAME}" "${FILE_PATH}")
+endforeach()
+
+if (LLAMA_BUILD_COMMON)
+    license_generate(common)
+endif()
+
 #
 # install
 #
```
package/src/llama.cpp/common/CMakeLists.txt
CHANGED

```diff
@@ -60,6 +60,8 @@ add_library(${TARGET} STATIC
     common.h
     console.cpp
     console.h
+    debug.cpp
+    debug.h
     download.cpp
     download.h
     http.h
@@ -95,17 +97,7 @@ endif()
 # TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
 set(LLAMA_COMMON_EXTRA_LIBS build_info)

-if (
-    # Use curl to download model url
-    find_package(CURL)
-    if (NOT CURL_FOUND)
-        message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
-    endif()
-    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
-    include_directories(${CURL_INCLUDE_DIRS})
-    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
-elseif (LLAMA_HTTPLIB)
-    # otherwise, use cpp-httplib
+if (LLAMA_HTTPLIB)
     target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB)
     set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
 endif()
@@ -162,26 +154,3 @@ else()
 endif()

 target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${LLAMA_COMMON_WIN_LIBS} PUBLIC llama Threads::Threads)
-
-#
-# copy the license files
-#
-
-# Check if running in GitHub Actions
-if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
-    message(STATUS "Running inside GitHub Actions - copying license files")
-
-    # Copy all files from licenses/ to build/bin/
-    file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
-    foreach(LICENSE_FILE ${LICENSE_FILES})
-        get_filename_component(FILENAME ${LICENSE_FILE} NAME)
-        add_custom_command(
-            POST_BUILD
-            TARGET ${TARGET}
-            COMMAND ${CMAKE_COMMAND} -E copy_if_different
-            "${LICENSE_FILE}"
-            "$<TARGET_FILE_DIR:llama>/${FILENAME}"
-            COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
-        message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
-    endforeach()
-endif()
```
package/src/llama.cpp/common/arg.cpp
CHANGED

```diff
@@ -2,10 +2,10 @@

 #include "chat.h"
 #include "common.h"
+#include "download.h"
 #include "json-schema-to-grammar.h"
 #include "log.h"
 #include "sampling.h"
-#include "download.h"
 #include "preset.h"

 // fix problem with std::min and std::max
@@ -48,6 +48,8 @@

 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

+extern const char * LICENSES[];
+
 using json = nlohmann::ordered_json;
 using namespace common_arg_utils;

@@ -279,12 +281,20 @@ static std::string clean_file_name(const std::string & fname) {
 static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
     GGML_ASSERT(!params.model.hf_repo.empty());

+    // the returned hf_repo is without tag
+    auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
+
+    // "latest" tag (default if not specified) is translated to "default" preset
+    if (hf_tag == "latest") {
+        hf_tag = "default";
+    }
+
     const bool offline = params.offline;
     std::string model_endpoint = get_model_endpoint();
-    auto preset_url = model_endpoint +
+    auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";

     // prepare local path for caching
-    auto preset_fname = clean_file_name(
+    auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
     auto preset_path = fs_get_cache_file(preset_fname);
     const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
     const bool has_preset = status >= 200 && status < 400;
@@ -293,14 +303,15 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
     if (has_preset) {
         LOG_INF("applying remote preset from %s\n", preset_url.c_str());
         common_preset_context ctx(ex, /* only_remote_allowed */ true);
-        common_preset global;
+        common_preset global;
         auto remote_presets = ctx.load_from_ini(preset_path, global);
-
-
+        remote_presets = ctx.cascade(global, remote_presets);
+        if (remote_presets.find(hf_tag) != remote_presets.end()) {
+            common_preset preset = remote_presets.at(hf_tag);
             LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
             preset.apply_to_params(params);
         } else {
-            throw std::runtime_error("Remote preset.ini does not contain [" + std::string(
+            throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
         }
     } else {
         LOG_INF("%s", "no remote preset found, skipping\n");
@@ -330,7 +341,7 @@ static handle_model_result common_params_handle_model(
     if (model.path.empty()) {
         auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
         if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
-            exit(1); //
+            exit(1); // error message already printed
         }
         model.name = model.hf_repo; // repo name with tag
         model.hf_repo = auto_detected.repo; // repo name without tag
@@ -1030,6 +1041,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             exit(0);
         }
     ));
+    add_opt(common_arg(
+        {"--license"},
+        "show source code license and dependencies",
+        [](common_params &) {
+            for (int i = 0; LICENSES[i]; ++i) {
+                printf("%s\n", LICENSES[i]);
+            }
+            exit(0);
+        }
+    ));
     add_opt(common_arg(
         {"-cl", "--cache-list"},
         "show list of models in cache",
@@ -1274,7 +1295,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.kv_unified = true;
         }
-    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED}));
     add_opt(common_arg(
         {"--context-shift"},
         {"--no-context-shift"},
@@ -2856,10 +2877,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_threads_http = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
+    add_opt(common_arg(
+        {"--cache-prompt"},
+        {"--no-cache-prompt"},
+        string_format("whether to enable prompt caching (default: %s)", params.cache_prompt ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.cache_prompt = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_PROMPT"));
     add_opt(common_arg(
         {"--cache-reuse"}, "N",
         string_format(
-            "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
+            "min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: %d)\n"
            "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
         ),
         [](common_params & params, int value) {
```
package/src/llama.cpp/common/chat-parser.cpp
CHANGED

```diff
@@ -1403,6 +1403,118 @@ static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
     builder.add_content(builder.consume_rest());
 }

+static void common_chat_parse_exaone_moe_content(common_chat_msg_parser & builder) {
+    // 1) <tool_call>{ "name": "...", "arguments": {...} }</tool_call>
+    // 2) <tool_call>{ "id": "...", "type": "function", "function": { "name": "...", "arguments": {...} } }</tool_call>
+    static const common_regex tool_call_open(R"(<tool_call[^>]*>)");
+
+    if (!builder.syntax().parse_tool_calls) {
+        LOG_DBG("%s: not parse_tool_calls\n", __func__);
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    LOG_DBG("%s: parse_tool_calls\n", __func__);
+
+    // Find all <tool_call></tool_call> blocks
+    while (auto first = builder.try_find_regex(tool_call_open, std::string::npos, /* add_prelude_to_content= */ true)) {
+        builder.move_to(first->groups[0].end);
+        builder.consume_spaces();
+
+        builder.try_consume_literal("```json");
+        builder.try_consume_literal("```");
+        builder.consume_spaces();
+
+        // Consume JSON object
+        auto data = builder.consume_json();
+
+        builder.consume_spaces();
+        builder.try_consume_literal("```");
+        builder.consume_spaces();
+
+        if (!builder.try_consume_literal("</tool_call>")) {
+            throw common_chat_msg_partial_exception("incomplete tool call");
+        }
+        builder.consume_spaces();
+
+        // Extract name and arguments
+        std::string name;
+        std::string id;
+        nlohmann::ordered_json arguments;
+
+        const auto extract_args = [&](const nlohmann::ordered_json & obj) -> bool {
+            if (!obj.contains("name") || !obj.contains("arguments")) {
+                return false;
+            }
+            name = obj.at("name").get<std::string>();
+            arguments = obj.at("arguments");
+            if (obj.contains("id") && obj.at("id").is_string()) {
+                id = obj.at("id").get<std::string>();
+            }
+            return true;
+        };
+
+        if (!extract_args(data.json)) {
+            if (data.json.contains("function") && data.json.at("function").is_object()) {
+                auto fn = data.json.at("function");
+                extract_args(fn);
+                if (id.empty() && data.json.contains("id") && data.json.at("id").is_string()) {
+                    id = data.json.at("id").get<std::string>();
+                }
+            }
+        }
+
+        // If name is empty, treat the JSON object as content
+        if (name.empty()) {
+            LOG_DBG("%s: tool call missing name, treating as content\n", __func__);
+            builder.add_content(data.json.dump());
+            continue;
+        }
+
+        std::string args_str = arguments.dump();
+        if (!builder.add_tool_call(name, id, args_str)) {
+            throw common_chat_msg_partial_exception("incomplete tool call");
+        }
+    }
+
+    builder.add_content(builder.consume_rest());
+}
+
+static void common_chat_parse_exaone_moe(common_chat_msg_parser & builder) {
+    LOG_DBG("%s: parsing exaone_moe\n", __func__);
+    // EXAONE MoE outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
+    // First try to parse using the standard reasoning parsing method
+    LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
+
+    auto start_pos = builder.pos();
+    auto found_end_think = builder.try_find_literal("</think>");
+    builder.move_to(start_pos);
+
+    if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
+        LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
+        common_chat_parse_exaone_moe_content(builder);
+    } else if (builder.try_parse_reasoning("<think>", "</think>")) {
+        // If reasoning was parsed successfully, the remaining content is regular content
+        LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
+        common_chat_parse_exaone_moe_content(builder);
+    } else {
+        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+            LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
+            common_chat_parse_exaone_moe_content(builder);
+            return;
+        }
+        // If no reasoning tags found, check if we should treat everything as reasoning
+        if (builder.syntax().thinking_forced_open) {
+            // If thinking is forced open but no tags found, treat everything as reasoning
+            LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
+            builder.add_reasoning_content(builder.consume_rest());
+        } else {
+            LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
+            common_chat_parse_exaone_moe_content(builder);
+        }
+    }
+}
+
 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
     builder.try_parse_reasoning("<think>", "</think>");
     builder.add_content(builder.consume_rest());
@@ -1490,6 +1602,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_SOLAR_OPEN:
             common_chat_parse_solar_open(builder);
             break;
+        case COMMON_CHAT_FORMAT_EXAONE_MOE:
+            common_chat_parse_exaone_moe(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
```
package/src/llama.cpp/common/chat.cpp
CHANGED

```diff
@@ -657,6 +657,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
         case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
         case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
+        case COMMON_CHAT_FORMAT_EXAONE_MOE: return "EXAONE MoE";
         case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
         case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
         case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";
@@ -2526,6 +2527,65 @@ static common_chat_params common_chat_params_init_solar_open(const common_chat_t
     return data;
 }

+static common_chat_params common_chat_params_init_exaone_moe(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_EXAONE_MOE;
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>\n\n";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    if (inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+                // Expect: <tool_call>{"name": "<name>", "arguments": {...}}</tool_call>
+                tool_rules.push_back(builder.add_rule(
+                    name + "-call",
+                    "\"<tool_call>\" space " +
+                    builder.add_schema(name + "-obj", json{
+                        {"type", "object"},
+                        {"properties", {
+                            {"name", json{{"const", name}}},
+                            {"arguments", parameters},
+                        }},
+                        {"required", json::array({"name", "arguments"})},
+                    }) +
+                    " space \"</tool_call>\" space"));
+            });
+
+            auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
+
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+                std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)?" : "") +
+                "(<tool_call>)[\\s\\S]*"
+            });
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<tool_call>",
+                "</tool_call>",
+            };
+        });
+    }
+
+    return data;
+}
+
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);
@@ -2696,6 +2756,13 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_xiaomi_mimo(tmpl, params);
     }

+    // EXAONE MoE format detection
+    if (src.find("<tool_call>") != std::string::npos &&
+        src.find("<tool_result>") != std::string::npos &&
+        src.find("<|tool_declare|>") != std::string::npos) {
+        return common_chat_params_init_exaone_moe(tmpl, params);
+    }
+
     // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
     if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
         return common_chat_params_init_hermes_2_pro(tmpl, params);
```
package/src/llama.cpp/common/chat.h
CHANGED

```diff
@@ -136,6 +136,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_APRIEL_1_5,
     COMMON_CHAT_FORMAT_XIAOMI_MIMO,
     COMMON_CHAT_FORMAT_SOLAR_OPEN,
+    COMMON_CHAT_FORMAT_EXAONE_MOE,

     // These are intended to be parsed by the PEG parser
     COMMON_CHAT_FORMAT_PEG_SIMPLE,
```
package/src/llama.cpp/common/common.h
CHANGED

```diff
@@ -80,6 +80,7 @@ int32_t cpu_get_num_math();
 //

 enum llama_example {
+    LLAMA_EXAMPLE_BATCHED,
     LLAMA_EXAMPLE_DEBUG,
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
@@ -476,6 +477,7 @@ struct common_params {
     int32_t timeout_write = timeout_read; // http write timeout in seconds
     int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
+    bool cache_prompt = true; // whether to enable prompt caching
     int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
     int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

```