@fugood/llama.node 1.2.0 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +16 -15
- package/src/llama.cpp/CMakeLists.txt +7 -0
- package/src/llama.cpp/common/arg.cpp +405 -221
- package/src/llama.cpp/common/chat.cpp +27 -15
- package/src/llama.cpp/common/common.h +23 -8
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +7 -6
- package/src/llama.cpp/ggml/CMakeLists.txt +37 -22
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +4 -4
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +4 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +74 -852
- package/src/llama.cpp/src/llama-arch.cpp +43 -10
- package/src/llama.cpp/src/llama-arch.h +8 -0
- package/src/llama.cpp/src/llama-chat.cpp +17 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +8 -8
- package/src/llama.cpp/src/llama-cparams.h +1 -1
- package/src/llama.cpp/src/llama-graph.cpp +3 -3
- package/src/llama.cpp/src/llama-hparams.h +13 -3
- package/src/llama.cpp/src/llama-model.cpp +328 -44
- package/src/llama.cpp/src/llama-model.h +3 -0
- package/src/llama.cpp/src/llama-quant.cpp +3 -1
- package/src/llama.cpp/src/llama-vocab.cpp +13 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
package/src/llama.cpp/common/chat.cpp

@@ -1727,10 +1727,12 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
     LOG_DBG("%s\n", __func__);
     common_chat_params data;
-
+    const std::optional<json> tools_override = json();
+    const std::optional<json> additional_context = json {
         {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
         {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
-    }
+    };
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, tools_override, additional_context);
     if (inputs.tools.is_array() && !inputs.tools.empty()) {
         data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
         data.grammar = build_grammar([&](const common_grammar_builder & builder) {
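The firefunction_v2 change above hoists the inline temporaries into named std::optional<json> overrides that are then passed to apply(). A minimal standalone sketch, assuming nlohmann::json and a hypothetical render() helper (not the package's code), of how an optional JSON parameter distinguishes "no override supplied" from "an explicitly provided value":

#include <nlohmann/json.hpp>
#include <optional>
#include <iostream>

using json = nlohmann::ordered_json;

// hypothetical helper, mirroring the shape of the apply(...) call above
static std::string render(const std::optional<json> & tools_override,
                          const std::optional<json> & additional_context) {
    std::string out;
    out += tools_override.has_value() ? "tools overridden; " : "tools from request; ";
    out += additional_context.has_value() ? additional_context->dump() : "{}";
    return out;
}

int main() {
    const std::optional<json> tools_override = json(); // engaged, holds a default (null) json
    const std::optional<json> additional_context = json{{"datetime", "Jan 01 2025 00:00:00 GMT"}};
    std::cout << render(std::nullopt, additional_context) << "\n"; // no tools override at all
    std::cout << render(tools_override, additional_context) << "\n";
}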
@@ -2216,15 +2218,28 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp
 
 static void common_chat_parse_granite(common_chat_msg_parser & builder) {
     // Parse thinking tags
+    static const common_regex start_think_regex(regex_escape("<think>"));
+    static const common_regex end_think_regex(regex_escape("</think>"));
+    // Granite models output partial tokens such as "<" and "<think".
+    // By leveraging try_consume_regex()/try_find_regex() throwing
+    // common_chat_msg_partial_exception for these partial tokens,
+    // processing is interrupted and the tokens are not passed to add_content().
+    if (auto res = builder.try_consume_regex(start_think_regex)) {
+        // Restore position for try_parse_reasoning()
+        builder.move_to(res->groups[0].begin);
+        builder.try_find_regex(end_think_regex, std::string::npos, false);
+        // Restore position for try_parse_reasoning()
+        builder.move_to(res->groups[0].begin);
+    }
     builder.try_parse_reasoning("<think>", "</think>");
 
-    // Parse response tags
-    static const common_regex
-
-
-
-
-    builder.
+    // Parse response tags
+    static const common_regex start_response_regex(regex_escape("<response>"));
+    static const common_regex end_response_regex(regex_escape("</response>"));
+    // Granite models output partial tokens such as "<" and "<response".
+    // Same hack as reasoning parsing.
+    if (builder.try_consume_regex(start_response_regex)) {
+        builder.try_find_regex(end_response_regex);
     }
 
     if (!builder.syntax().parse_tool_calls) {
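The comments in this hunk describe the core trick: while streaming, a fragment such as "<" or "<think" must be held back rather than emitted as content, and the regex helpers signal that state by throwing common_chat_msg_partial_exception. A minimal standalone sketch of the same prefix check, using a hypothetical is_partial_tag() helper rather than the library's API:

#include <string>
#include <iostream>

// hypothetical helper: true if `tail` is a non-empty proper prefix of `tag`,
// i.e. the buffer might still grow into the full tag and should be held back
static bool is_partial_tag(const std::string & tail, const std::string & tag) {
    return !tail.empty() && tail.size() < tag.size() && tag.compare(0, tail.size(), tail) == 0;
}

int main() {
    const std::string tag = "<think>";
    for (const std::string chunk : {"<", "<think", "<think>", "hello"}) {
        std::cout << chunk << " -> "
                  << (is_partial_tag(chunk, tag) ? "hold back (partial)" : "safe to process")
                  << "\n";
    }
}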
@@ -2238,13 +2253,10 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
         builder.move_to(res->groups[0].end);
 
         // Expect JSON array of tool calls
-        auto
-
-
-                builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
+        if (auto tool_call = builder.try_consume_json_with_dumped_args({{{"arguments"}}})) {
+            if (!builder.add_tool_calls(tool_call->value) || tool_call->is_partial) {
+                throw common_chat_msg_partial_exception("incomplete tool call");
             }
-        } else {
-            builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
         }
     } else {
         builder.add_content(builder.consume_rest());
package/src/llama.cpp/common/common.h

@@ -193,10 +193,11 @@ struct common_params_sampling {
 };
 
 struct common_params_model {
-    std::string path
-    std::string url
-    std::string hf_repo
-    std::string hf_file
+    std::string path = "";        // model local path      // NOLINT
+    std::string url = "";         // model url to download // NOLINT
+    std::string hf_repo = "";     // HF repo               // NOLINT
+    std::string hf_file = "";     // HF file               // NOLINT
+    std::string docker_repo = ""; // Docker repo           // NOLINT
 };
 
 struct common_params_speculative {
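Purely hypothetical illustration of how fields like these might select a model source in priority order; the ordering and the describe() helper are assumptions for the sketch, not taken from the package:

#include <string>
#include <iostream>

struct model_source {
    std::string path;        // model local path
    std::string url;         // model url to download
    std::string hf_repo;     // HF repo
    std::string hf_file;     // HF file
    std::string docker_repo; // Docker repo
};

// assumed priority: local path, then direct URL, then HF repo/file, then Docker repo
static std::string describe(const model_source & m) {
    if (!m.path.empty())        return "load local file: " + m.path;
    if (!m.url.empty())         return "download from URL: " + m.url;
    if (!m.hf_repo.empty())     return "download from HF: " + m.hf_repo + "/" + m.hf_file;
    if (!m.docker_repo.empty()) return "pull from Docker registry: " + m.docker_repo;
    return "no model specified";
}

int main() {
    model_source m;
    m.hf_repo = "ggml-org/models"; // placeholder repo name
    m.hf_file = "model.gguf";
    std::cout << describe(m) << "\n";
}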
@@ -288,9 +289,9 @@ struct common_params {
     float rope_freq_base = 0.0f; // RoPE base frequency
     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
     float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
-    float yarn_attn_factor =
-    float yarn_beta_fast =
-    float yarn_beta_slow =
+    float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
+    float yarn_beta_fast = -1.0f; // YaRN low correction dim
+    float yarn_beta_slow = -1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
 
     // offload params
@@ -453,7 +454,7 @@ struct common_params {
 
     std::string slot_save_path;
 
-    float slot_prompt_similarity = 0.
+    float slot_prompt_similarity = 0.1f;
 
     // batched-bench params
     bool is_pp_shared = false;
@@ -734,6 +735,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
 }
 
+//
+// MoE utils
+//
+
+const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
+
+static std::string llm_ffn_exps_block_regex(int idx) {
+    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
+}
+
+static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
+    return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
+}
+
 //
 // training utils
 //
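The new helpers target the per-layer expert FFN tensors by name. A standalone sketch of how the same pattern matches tensor names, using std::regex instead of the library's buffer-type override machinery; the sample tensor names and the block_regex() helper are assumptions for illustration:

#include <regex>
#include <string>
#include <iostream>
#include <cstdio>

static const char * FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";

// hypothetical equivalent of llm_ffn_exps_block_regex() for a single layer
static std::string block_regex(int idx) {
    char buf[64];
    std::snprintf(buf, sizeof(buf), "blk\\.%d%s", idx, FFN_EXPS_REGEX);
    return buf;
}

int main() {
    const std::regex any_layer(FFN_EXPS_REGEX);
    const std::regex layer_12(block_regex(12));
    for (const std::string name : {"blk.12.ffn_up_exps.weight", "blk.3.ffn_down_exps.weight", "blk.12.attn_q.weight"}) {
        std::cout << name
                  << "  any layer: " << (std::regex_search(name, any_layer) ? "match" : "no")
                  << "  layer 12: "  << (std::regex_search(name, layer_12)  ? "match" : "no")
                  << "\n";
    }
}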
package/src/llama.cpp/common/json-schema-to-grammar.cpp

@@ -257,12 +257,13 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
 };
 
 static bool is_reserved_name(const std::string & name) {
-    static std::unordered_set<std::string> RESERVED_NAMES
-
-
-    for (const auto &p : PRIMITIVE_RULES)
-    for (const auto &p : STRING_FORMAT_RULES)
-
+    static const std::unordered_set<std::string> RESERVED_NAMES = [] {
+        std::unordered_set<std::string> s;
+        s.insert("root");
+        for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
+        for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
+        return s;
+    }();
     return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
 }
 
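The rewrite above switches RESERVED_NAMES to the immediately invoked lambda idiom, so the set is built exactly once and stays const afterwards. A standalone sketch of the idiom with sample entries; the is_keyword() function and its contents are illustrative, not the project's code:

#include <string>
#include <unordered_set>
#include <iostream>

static bool is_keyword(const std::string & name) {
    static const std::unordered_set<std::string> KEYWORDS = [] {
        std::unordered_set<std::string> s;
        for (const char * k : {"root", "string", "number", "boolean"}) { // sample entries
            s.insert(k);
        }
        return s;
    }(); // invoked once; later calls reuse the already built set
    return KEYWORDS.count(name) > 0;
}

int main() {
    std::cout << is_keyword("root") << " " << is_keyword("foo") << "\n"; // prints: 1 0
}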
package/src/llama.cpp/ggml/CMakeLists.txt

@@ -1,5 +1,41 @@
 cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
 project("ggml" C CXX ASM)
+
+### GGML Version
+set(GGML_VERSION_MAJOR 0)
+set(GGML_VERSION_MINOR 9)
+set(GGML_VERSION_PATCH 0)
+set(GGML_VERSION_DEV "-dev") # "-dev" for development, "" for releases
+set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
+
+find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
+if(GIT_EXE)
+    # Get current git commit hash
+    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE GGML_BUILD_COMMIT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+    )
+
+    # Check if the working directory is dirty (i.e., has uncommitted changes)
+    execute_process(COMMAND ${GIT_EXE} diff-index --quiet HEAD -- .
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        RESULT_VARIABLE GGML_GIT_DIRTY
+        ERROR_QUIET
+    )
+endif()
+
+# Build the version string with optional -dev suffix and dirty flag
+set(GGML_VERSION "${GGML_VERSION_BASE}${GGML_VERSION_DEV}")
+if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
+    set(GGML_VERSION "${GGML_VERSION}-dirty")
+endif()
+
+if(NOT GGML_BUILD_COMMIT)
+    set(GGML_BUILD_COMMIT "unknown")
+endif()
+
 include(CheckIncludeFileCXX)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -190,7 +226,6 @@ option(GGML_WEBGPU "ggml: use WebGPU"
 option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
 option(GGML_ZDNN "ggml: use zDNN" OFF)
 option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
-option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
 option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
 option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
 option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
@@ -301,26 +336,6 @@ endif()
 # Create CMake package
 #
 
-# Generate version info based on git commit.
-
-if(NOT DEFINED GGML_BUILD_NUMBER)
-    find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH)
-    execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        OUTPUT_VARIABLE GGML_BUILD_NUMBER
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-    )
-
-    if(GGML_BUILD_NUMBER EQUAL 1)
-        message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.")
-    endif()
-
-    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        OUTPUT_VARIABLE GGML_BUILD_COMMIT
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-    )
-endif()
 
 
 # Capture variables prefixed with GGML_.
@@ -349,7 +364,7 @@ set(GGML_VARIABLES_EXPANDED ${variable_set_statements})
 
 # Create the CMake package and set install location.
 
-set(GGML_INSTALL_VERSION
+set(GGML_INSTALL_VERSION ${GGML_VERSION})
 set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
 set(GGML_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
 set(GGML_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
package/src/llama.cpp/ggml/include/ggml.h

@@ -284,19 +284,19 @@ __host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexc
 //     GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
 //
 #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
-    const type prefix##0 = (pointer)->array[0]; \
+    const type prefix##0 = (pointer) ? (pointer)->array[0] : 0; \
     GGML_UNUSED(prefix##0);
 #define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
     GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
-    const type prefix##1 = (pointer)->array[1]; \
+    const type prefix##1 = (pointer) ? (pointer)->array[1] : 0; \
     GGML_UNUSED(prefix##1);
 #define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
     GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
-    const type prefix##2 = (pointer)->array[2]; \
+    const type prefix##2 = (pointer) ? (pointer)->array[2] : 0; \
     GGML_UNUSED(prefix##2);
 #define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
     GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
-    const type prefix##3 = (pointer)->array[3]; \
+    const type prefix##3 = (pointer) ? (pointer)->array[3] : 0; \
     GGML_UNUSED(prefix##3);
 
 #define GGML_TENSOR_UNARY_OP_LOCALS \
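The change guards each GGML_TENSOR_LOCALS_* expansion with a null check, so locals derived from an absent (NULL) tensor become 0 instead of dereferencing a null pointer. A toy hand-expansion of the same idea; toy_tensor and TOY_TENSOR_LOCAL_0 are illustrative stand-ins, not ggml types:

#include <cstdint>
#include <cstdio>

struct toy_tensor {
    int64_t ne[4]; // element counts per dimension, as in ggml_tensor
};

// hand-expanded equivalent of the guarded macro for one local
#define TOY_TENSOR_LOCAL_0(type, prefix, pointer, array) \
    const type prefix##0 = (pointer) ? (pointer)->array[0] : 0;

int main() {
    toy_tensor t = {{7, 1, 1, 1}};
    toy_tensor * src0 = &t;
    toy_tensor * src1 = nullptr; // optional input not present

    TOY_TENSOR_LOCAL_0(int64_t, ne0, src0, ne) // ne00 becomes 7
    TOY_TENSOR_LOCAL_0(int64_t, ne1, src1, ne) // ne10 becomes 0, no NULL dereference

    std::printf("%lld %lld\n", (long long) ne00, (long long) ne10); // prints: 7 0
    return 0;
}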
package/src/llama.cpp/ggml/src/CMakeLists.txt

@@ -114,6 +114,9 @@ message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")
 
 if (NOT MSVC)
     if (GGML_STATIC)
+        if (UNIX AND NOT APPLE)
+            set(CMAKE_FIND_LIBRARY_SUFFIXES ".a;.so")
+        endif()
         add_link_options(-static)
         if (MINGW)
             add_link_options(-static-libgcc -static-libstdc++)
package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp

@@ -7,7 +7,7 @@
 #include "ggml-cpu.h"
 #include "traits.h"
 
-#if defined(
+#if defined(__linux__)
 #include <sys/syscall.h>
 #include <unistd.h>
 #endif
|
@@ -186,7 +186,7 @@ static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_ty
|
|
|
186
186
|
#define XFEATURE_XTILEDATA 18
|
|
187
187
|
|
|
188
188
|
static bool ggml_amx_init() {
|
|
189
|
-
#if defined(
|
|
189
|
+
#if defined(__linux__)
|
|
190
190
|
if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
|
|
191
191
|
fprintf(stderr, "AMX is not ready to be used!\n");
|
|
192
192
|
return false;
|
|
@@ -194,6 +194,8 @@ static bool ggml_amx_init() {
     return true;
 #elif defined(_WIN32)
     return true;
+#else
+    return false;
 #endif
 }
 
package/src/llama.cpp/ggml/src/ggml-cpu/common.h

@@ -28,6 +28,14 @@ static inline float bf16_to_f32(ggml_bf16_t x) {
     return GGML_BF16_TO_FP32(x);
 }
 
+static inline float i32_to_f32(int32_t x) {
+    return x;
+}
+
+static inline int32_t f32_to_i32(float x) {
+    return x;
+}
+
 static inline float f32_to_f32(float x) {
     return x;
 }
@@ -54,6 +62,12 @@ struct type_conversion_table<ggml_bf16_t> {
     static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
 };
 
+template <>
+struct type_conversion_table<int32_t> {
+    static constexpr float (*to_f32)(int32_t) = i32_to_f32;
+    static constexpr int32_t (*from_f32)(float) = f32_to_i32;
+};
+
 static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
     const int64_t ith = params->ith;
     const int64_t nth = params->nth;
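The new specialization plugs int32_t into the existing type_conversion_table trait so generic CPU kernels can round-trip values through float. A standalone sketch of the pattern; conv_table and scale() are illustrative stand-ins, not the ggml implementation:

#include <cstdint>
#include <cstdio>

static inline float   i32_to_f32(int32_t x) { return x; }
static inline int32_t f32_to_i32(float x)   { return x; }
static inline float   f32_to_f32(float x)   { return x; }

template <typename T> struct conv_table;

template <> struct conv_table<float> {
    static constexpr float (*to_f32)(float)   = f32_to_f32;
    static constexpr float (*from_f32)(float) = f32_to_f32;
};

template <> struct conv_table<int32_t> {
    static constexpr float   (*to_f32)(int32_t) = i32_to_f32;
    static constexpr int32_t (*from_f32)(float) = f32_to_i32;
};

// a generic element-wise op: compute in float, store back in the source type
template <typename T>
static void scale(T * data, int n, float factor) {
    for (int i = 0; i < n; ++i) {
        data[i] = conv_table<T>::from_f32(conv_table<T>::to_f32(data[i]) * factor);
    }
}

int main() {
    int32_t v[3] = {1, 2, 3};
    scale(v, 3, 2.0f);
    std::printf("%d %d %d\n", v[0], v[1], v[2]); // prints: 2 4 6
}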
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp

@@ -190,7 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
     /* .graph_compute = */ ggml_backend_cpu_graph_compute,
     /* .event_record = */ NULL,
     /* .event_wait = */ NULL,
-    /* .
+    /* .graph_optimize = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_cpu_guid(void) {