@fugood/llama.node 1.2.1 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/common/arg.cpp +359 -310
- package/src/llama.cpp/common/chat.cpp +27 -15
- package/src/llama.cpp/common/common.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +1 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +37 -21
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -1
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +3 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +4 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +17 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +93 -862
- package/src/llama.cpp/include/llama.h +15 -11
- package/src/llama.cpp/src/llama-context.cpp +151 -0
- package/src/llama.cpp/src/llama-context.h +10 -0
- package/src/llama.cpp/src/llama-cparams.h +1 -1
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +8 -0
- package/src/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +8 -0
- package/src/llama.cpp/src/llama-kv-cache.h +2 -0
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +8 -0
- package/src/llama.cpp/src/llama-memory-hybrid.h +2 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +8 -0
- package/src/llama.cpp/src/llama-memory-recurrent.h +3 -0
- package/src/llama.cpp/src/llama-memory.h +3 -0
- package/src/llama.cpp/src/llama-model.cpp +14 -4
- package/src/llama.cpp/src/llama-model.h +5 -1
package/src/llama.cpp/common/chat.cpp

```diff
@@ -1727,10 +1727,12 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
     LOG_DBG("%s\n", __func__);
     common_chat_params data;
-
+    const std::optional<json> tools_override = json();
+    const std::optional<json> additional_context = json {
         {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
         {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
-    }
+    };
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, tools_override, additional_context);
     if (inputs.tools.is_array() && !inputs.tools.empty()) {
         data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
         data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@@ -2216,15 +2218,28 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp
 
 static void common_chat_parse_granite(common_chat_msg_parser & builder) {
     // Parse thinking tags
+    static const common_regex start_think_regex(regex_escape("<think>"));
+    static const common_regex end_think_regex(regex_escape("</think>"));
+    // Granite models output partial tokens such as "<" and "<think".
+    // By leveraging try_consume_regex()/try_find_regex() throwing
+    // common_chat_msg_partial_exception for these partial tokens,
+    // processing is interrupted and the tokens are not passed to add_content().
+    if (auto res = builder.try_consume_regex(start_think_regex)) {
+        // Restore position for try_parse_reasoning()
+        builder.move_to(res->groups[0].begin);
+        builder.try_find_regex(end_think_regex, std::string::npos, false);
+        // Restore position for try_parse_reasoning()
+        builder.move_to(res->groups[0].begin);
+    }
     builder.try_parse_reasoning("<think>", "</think>");
 
-    // Parse response tags
-    static const common_regex
-
-
-
-
-    builder.
+    // Parse response tags
+    static const common_regex start_response_regex(regex_escape("<response>"));
+    static const common_regex end_response_regex(regex_escape("</response>"));
+    // Granite models output partial tokens such as "<" and "<response".
+    // Same hack as reasoning parsing.
+    if (builder.try_consume_regex(start_response_regex)) {
+        builder.try_find_regex(end_response_regex);
     }
 
     if (!builder.syntax().parse_tool_calls) {
@@ -2238,13 +2253,10 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
         builder.move_to(res->groups[0].end);
 
         // Expect JSON array of tool calls
-        auto
-
-
-        builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
+        if (auto tool_call = builder.try_consume_json_with_dumped_args({{{"arguments"}}})) {
+            if (!builder.add_tool_calls(tool_call->value) || tool_call->is_partial) {
+                throw common_chat_msg_partial_exception("incomplete tool call");
             }
-        } else {
-            builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
         }
     } else {
         builder.add_content(builder.consume_rest());
```
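The Granite parser above leans on try_consume_regex()/try_find_regex() throwing common_chat_msg_partial_exception when the stream ends in a partial tag such as "<" or "<think". As a rough, standalone illustration of the underlying idea (this is a sketch, not llama.cpp code): a streaming parser must hold back any buffer tail that could still grow into a full tag instead of emitting it as content.

```cpp
#include <cassert>
#include <string>

// Returns true if `tail` is a non-empty proper prefix of `tag`, i.e. the next
// tokens may still complete the tag, so the caller should wait rather than
// forward the characters as plain content.
static bool could_become_tag(const std::string & tail, const std::string & tag) {
    return !tail.empty() && tail.size() < tag.size() &&
           tag.compare(0, tail.size(), tail) == 0;
}

int main() {
    const std::string tag = "<think>";
    assert(could_become_tag("<", tag));        // hold back, might become "<think>"
    assert(could_become_tag("<thi", tag));     // hold back
    assert(!could_become_tag("<td>", tag));    // safe to emit as content
    assert(!could_become_tag("<think>", tag)); // complete tag, handle as reasoning
    return 0;
}
```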
package/src/llama.cpp/ggml/CMakeLists.txt

```diff
@@ -1,5 +1,41 @@
 cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
 project("ggml" C CXX ASM)
+
+### GGML Version
+set(GGML_VERSION_MAJOR 0)
+set(GGML_VERSION_MINOR 9)
+set(GGML_VERSION_PATCH 0)
+set(GGML_VERSION_DEV "-dev") # "-dev" for development, "" for releases
+set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
+
+find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
+if(GIT_EXE)
+    # Get current git commit hash
+    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE GGML_BUILD_COMMIT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+    )
+
+    # Check if the working directory is dirty (i.e., has uncommitted changes)
+    execute_process(COMMAND ${GIT_EXE} diff-index --quiet HEAD -- .
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        RESULT_VARIABLE GGML_GIT_DIRTY
+        ERROR_QUIET
+    )
+endif()
+
+# Build the version string with optional -dev suffix and dirty flag
+set(GGML_VERSION "${GGML_VERSION_BASE}${GGML_VERSION_DEV}")
+if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
+    set(GGML_VERSION "${GGML_VERSION}-dirty")
+endif()
+
+if(NOT GGML_BUILD_COMMIT)
+    set(GGML_BUILD_COMMIT "unknown")
+endif()
+
 include(CheckIncludeFileCXX)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -300,26 +336,6 @@ endif()
 # Create CMake package
 #
 
-# Generate version info based on git commit.
-
-if(NOT DEFINED GGML_BUILD_NUMBER)
-    find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH)
-    execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        OUTPUT_VARIABLE GGML_BUILD_NUMBER
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-    )
-
-    if(GGML_BUILD_NUMBER EQUAL 1)
-        message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.")
-    endif()
-
-    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        OUTPUT_VARIABLE GGML_BUILD_COMMIT
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-    )
-endif()
 
 
 # Capture variables prefixed with GGML_.
@@ -348,7 +364,7 @@ set(GGML_VARIABLES_EXPANDED ${variable_set_statements})
 
 # Create the CMake package and set install location.
 
-set(GGML_INSTALL_VERSION
+set(GGML_INSTALL_VERSION ${GGML_VERSION})
 set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
 set(GGML_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
 set(GGML_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
```
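These hunks replace the old rev-list-based GGML_BUILD_NUMBER with an explicit semantic version plus git metadata, and GGML_INSTALL_VERSION now reuses the composed GGML_VERSION. A small standalone C++ sketch of the same composition logic, using the values set in this diff (illustrative only; the real logic is the CMake above):

```cpp
#include <cstdio>
#include <string>

// Compose a ggml-style version string from its parts, mirroring the CMake
// logic above: "<major>.<minor>.<patch>[-dev][-dirty]".
static std::string ggml_style_version(int major, int minor, int patch,
                                      bool dev, bool git_dirty) {
    std::string v = std::to_string(major) + "." + std::to_string(minor) + "." + std::to_string(patch);
    if (dev)       { v += "-dev";   }
    if (git_dirty) { v += "-dirty"; }
    return v;
}

int main() {
    // With the values from this diff (0.9.0 plus "-dev") and a dirty work tree:
    std::printf("%s\n", ggml_style_version(0, 9, 0, true, true).c_str());  // 0.9.0-dev-dirty
    std::printf("%s\n", ggml_style_version(0, 9, 0, true, false).c_str()); // 0.9.0-dev
    return 0;
}
```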
package/src/llama.cpp/ggml/include/ggml-backend.h

```diff
@@ -314,7 +314,8 @@ extern "C" {
     GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
     GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
 
-    GGML_API
+    GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
 
     GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
```
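The new declarations let callers ask the scheduler which buffer type a backend ended up with and how much memory was reserved on it. A hedged usage sketch; it assumes the existing ggml-backend accessors (ggml_backend_sched_get_n_backends(), ggml_backend_sched_get_backend(), ggml_backend_name(), ggml_backend_buft_name()) and an already-constructed scheduler:

```cpp
#include <cstdio>
#include "ggml-backend.h"

// For an existing scheduler, report which buffer type each backend was
// assigned and how many bytes the scheduler reserved on it.
static void print_sched_buffers(ggml_backend_sched_t sched) {
    const int n = ggml_backend_sched_get_n_backends(sched);
    for (int i = 0; i < n; ++i) {
        ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
        ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched, backend);
        const size_t size = ggml_backend_sched_get_buffer_size(sched, backend);
        std::printf("%s: buffer type %s, %zu bytes\n",
                    ggml_backend_name(backend), ggml_backend_buft_name(buft), size);
    }
}
```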
package/src/llama.cpp/ggml/src/CMakeLists.txt

```diff
@@ -114,6 +114,9 @@ message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")
 
 if (NOT MSVC)
     if (GGML_STATIC)
+        if (UNIX AND NOT APPLE)
+            set(CMAKE_FIND_LIBRARY_SUFFIXES ".a;.so")
+        endif()
        add_link_options(-static)
        if (MINGW)
            add_link_options(-static-libgcc -static-libstdc++)
```
package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp

```diff
@@ -7,7 +7,7 @@
 #include "ggml-cpu.h"
 #include "traits.h"
 
-#if defined(
+#if defined(__linux__)
 #include <sys/syscall.h>
 #include <unistd.h>
 #endif
@@ -186,7 +186,7 @@ static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_ty
 #define XFEATURE_XTILEDATA 18
 
 static bool ggml_amx_init() {
-#if defined(
+#if defined(__linux__)
     if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
         fprintf(stderr, "AMX is not ready to be used!\n");
         return false;
@@ -194,6 +194,8 @@ static bool ggml_amx_init() {
     return true;
 #elif defined(_WIN32)
     return true;
+#else
+    return false;
 #endif
 }
 
```
package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp

```diff
@@ -878,7 +878,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
         const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
         const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
 
-        // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of
+        // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
         const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
         const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
         const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
@@ -1231,7 +1231,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
         const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
         const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
 
-        // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of
+        // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
         const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
         const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
         const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
```
package/src/llama.cpp/ggml/src/ggml-cpu/common.h

```diff
@@ -28,6 +28,14 @@ static inline float bf16_to_f32(ggml_bf16_t x) {
     return GGML_BF16_TO_FP32(x);
 }
 
+static inline float i32_to_f32(int32_t x) {
+    return x;
+}
+
+static inline int32_t f32_to_i32(float x) {
+    return x;
+}
+
 static inline float f32_to_f32(float x) {
     return x;
 }
@@ -54,6 +62,12 @@ struct type_conversion_table<ggml_bf16_t> {
     static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
 };
 
+template <>
+struct type_conversion_table<int32_t> {
+    static constexpr float (*to_f32)(int32_t) = i32_to_f32;
+    static constexpr int32_t (*from_f32)(float) = f32_to_i32;
+};
+
 static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
     const int64_t ith = params->ith;
     const int64_t nth = params->nth;
```
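With the int32_t specialization in place, the templated CPU ops can route I32 data through the same float round-trip as the other element types. The following standalone sketch re-declares a simplified version of the table pattern to show how such a table drives a generic conversion loop (it is an illustration, not the ggml header itself):

```cpp
#include <cstdint>
#include <cstdio>

// Simplified stand-in for the conversion-table pattern used in ggml-cpu/common.h.
template <typename T> struct type_conversion_table;

static inline float   i32_to_f32(int32_t x) { return x; }
static inline int32_t f32_to_i32(float x)   { return x; }
static inline float   f32_to_f32(float x)   { return x; }

template <> struct type_conversion_table<int32_t> {
    static constexpr float   (*to_f32)(int32_t) = i32_to_f32;
    static constexpr int32_t (*from_f32)(float) = f32_to_i32;
};

template <> struct type_conversion_table<float> {
    static constexpr float (*to_f32)(float)   = f32_to_f32;
    static constexpr float (*from_f32)(float) = f32_to_f32;
};

// Element-wise copy with conversion through float, the way the templated
// CPU ops use these tables.
template <typename TSrc, typename TDst>
static void convert_row(const TSrc * src, TDst * dst, int n) {
    for (int i = 0; i < n; ++i) {
        dst[i] = type_conversion_table<TDst>::from_f32(type_conversion_table<TSrc>::to_f32(src[i]));
    }
}

int main() {
    const int32_t src[3] = {1, 2, 3};
    float dst[3];
    convert_row(src, dst, 3);
    std::printf("%.1f %.1f %.1f\n", dst[0], dst[1], dst[2]); // 1.0 2.0 3.0
    return 0;
}
```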
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c

```diff
@@ -473,10 +473,10 @@ struct ggml_threadpool {
 struct ggml_compute_state {
 #ifndef GGML_USE_OPENMP
     ggml_thread_t thrd;
-    bool cpumask[GGML_MAX_N_THREADS];
     int last_graph;
     bool pending;
 #endif
+    bool cpumask[GGML_MAX_N_THREADS];
     struct ggml_threadpool * threadpool;
     int ith;
 };
@@ -3081,7 +3081,14 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
 
     threadpool->workers = workers;
 
-#
+#ifdef GGML_USE_OPENMP
+    int32_t cpumask_iter = 0;
+
+    // Compute CPU masks for each thread
+    for (int j = 0; j < tpp->n_threads; j++) {
+        ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
+    }
+#else // GGML_USE_OPENMP
     ggml_mutex_init(&threadpool->mutex);
     ggml_cond_init(&threadpool->cond);
 
@@ -3154,7 +3161,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
                 atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
             }
 
-
+            // Apply thread CPU mask and priority
+            int ith = omp_get_thread_num();
+
+            ggml_thread_apply_priority(threadpool->prio);
+            if (ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
+                ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
+            }
+            ggml_graph_compute_thread(&threadpool->workers[ith]);
         }
     } else {
         atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
```
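The threadpool hunks move cpumask out of the pthread-only part of ggml_compute_state and precompute one mask per worker even when OpenMP drives the pool, so each OpenMP thread can apply its priority and affinity before running the graph. A standalone sketch of that pattern (compile with -fopenmp; it only prints the per-thread assignment instead of calling ggml's affinity helpers):

```cpp
#include <omp.h>
#include <cstdio>
#include <vector>

int main() {
    const int n_threads = 4;
    const int n_cpus    = 8;

    // Precompute one CPU assignment per worker up front (stand-in for
    // ggml_thread_cpumask_next filling workers[j].cpumask).
    std::vector<int> cpu_for_worker(n_threads);
    for (int j = 0; j < n_threads; ++j) {
        cpu_for_worker[j] = j % n_cpus;
    }

    // Each OpenMP thread looks up its own mask by thread id; the real code
    // would apply priority/affinity here before running the graph work.
    #pragma omp parallel num_threads(n_threads)
    {
        const int ith = omp_get_thread_num();
        std::printf("worker %d -> cpu %d\n", ith, cpu_for_worker[ith]);
    }
    return 0;
}
```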
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp

```diff
@@ -190,7 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
     /* .graph_compute = */ ggml_backend_cpu_graph_compute,
     /* .event_record = */ NULL,
     /* .event_wait = */ NULL,
-    /* .
+    /* .graph_optimize = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_cpu_guid(void) {
```