@fugood/llama.node 1.4.14 → 1.5.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +13 -6
- package/lib/index.js +2 -2
- package/lib/index.ts +8 -3
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +77 -65
- package/src/LlamaContext.cpp +31 -34
- package/src/llama.cpp/CMakeLists.txt +24 -8
- package/src/llama.cpp/common/CMakeLists.txt +15 -34
- package/src/llama.cpp/common/arg.cpp +59 -10
- package/src/llama.cpp/common/chat-parser.cpp +115 -0
- package/src/llama.cpp/common/chat.cpp +356 -34
- package/src/llama.cpp/common/chat.h +17 -13
- package/src/llama.cpp/common/common.cpp +0 -1
- package/src/llama.cpp/common/common.h +30 -25
- package/src/llama.cpp/common/debug.cpp +165 -0
- package/src/llama.cpp/common/debug.h +43 -0
- package/src/llama.cpp/common/download.cpp +12 -342
- package/src/llama.cpp/common/download.h +6 -0
- package/src/llama.cpp/common/jinja/caps.cpp +237 -0
- package/src/llama.cpp/common/jinja/caps.h +24 -0
- package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
- package/src/llama.cpp/common/jinja/lexer.h +157 -0
- package/src/llama.cpp/common/jinja/parser.cpp +591 -0
- package/src/llama.cpp/common/jinja/parser.h +21 -0
- package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
- package/src/llama.cpp/common/jinja/runtime.h +628 -0
- package/src/llama.cpp/common/jinja/string.cpp +207 -0
- package/src/llama.cpp/common/jinja/string.h +58 -0
- package/src/llama.cpp/common/jinja/utils.h +49 -0
- package/src/llama.cpp/common/jinja/value.cpp +1221 -0
- package/src/llama.cpp/common/jinja/value.h +464 -0
- package/src/llama.cpp/common/preset.cpp +12 -2
- package/src/llama.cpp/common/sampling.cpp +52 -19
- package/src/llama.cpp/ggml/include/ggml.h +39 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
- package/src/llama.cpp/include/llama-cpp.h +3 -1
- package/src/llama.cpp/include/llama.h +29 -2
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-adapter.cpp +7 -13
- package/src/llama.cpp/src/llama-adapter.h +1 -3
- package/src/llama.cpp/src/llama-arch.cpp +35 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +20 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +232 -144
- package/src/llama.cpp/src/llama-context.h +10 -0
- package/src/llama.cpp/src/llama-cparams.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +31 -43
- package/src/llama.cpp/src/llama-hparams.cpp +0 -36
- package/src/llama.cpp/src/llama-hparams.h +38 -1
- package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
- package/src/llama.cpp/src/llama-kv-cache.h +0 -2
- package/src/llama.cpp/src/llama-mmap.cpp +13 -6
- package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
- package/src/llama.cpp/src/llama-model.cpp +215 -97
- package/src/llama.cpp/src/llama-model.h +3 -2
- package/src/llama.cpp/src/llama-sampling.cpp +170 -13
- package/src/llama.cpp/src/llama-vocab.cpp +37 -24
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
- package/src/llama.cpp/src/models/models.h +13 -2
- package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
|
@@ -80,6 +80,7 @@ int32_t cpu_get_num_math();
|
|
|
80
80
|
//
|
|
81
81
|
|
|
82
82
|
enum llama_example {
|
|
83
|
+
LLAMA_EXAMPLE_BATCHED,
|
|
83
84
|
LLAMA_EXAMPLE_DEBUG,
|
|
84
85
|
LLAMA_EXAMPLE_COMMON,
|
|
85
86
|
LLAMA_EXAMPLE_SPECULATIVE,
|
|
@@ -118,6 +119,7 @@ enum common_sampler_type {
|
|
|
118
119
|
COMMON_SAMPLER_TYPE_INFILL = 9,
|
|
119
120
|
COMMON_SAMPLER_TYPE_PENALTIES = 10,
|
|
120
121
|
COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
|
|
122
|
+
COMMON_SAMPLER_TYPE_ADAPTIVE_P = 12,
|
|
121
123
|
};
|
|
122
124
|
|
|
123
125
|
// dimensionality reduction methods, used by cvector-generator
|
|
@@ -165,32 +167,34 @@ enum common_params_sampling_config : uint64_t {
|
|
|
165
167
|
struct common_params_sampling {
|
|
166
168
|
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
|
167
169
|
|
|
168
|
-
int32_t n_prev = 64;
|
|
169
|
-
int32_t n_probs = 0;
|
|
170
|
-
int32_t min_keep = 0;
|
|
171
|
-
int32_t top_k = 40;
|
|
172
|
-
float top_p = 0.95f;
|
|
173
|
-
float min_p = 0.05f;
|
|
174
|
-
float xtc_probability = 0.00f;
|
|
175
|
-
float xtc_threshold = 0.10f;
|
|
176
|
-
float typ_p = 1.00f;
|
|
177
|
-
float temp = 0.80f;
|
|
178
|
-
float dynatemp_range = 0.00f;
|
|
179
|
-
float dynatemp_exponent = 1.00f;
|
|
180
|
-
int32_t penalty_last_n = 64;
|
|
181
|
-
float penalty_repeat = 1.00f;
|
|
182
|
-
float penalty_freq = 0.00f;
|
|
183
|
-
float penalty_present = 0.00f;
|
|
184
|
-
float dry_multiplier = 0.0f;
|
|
185
|
-
float dry_base = 1.75f;
|
|
186
|
-
int32_t dry_allowed_length = 2;
|
|
187
|
-
int32_t dry_penalty_last_n = -1;
|
|
188
|
-
|
|
189
|
-
float
|
|
190
|
-
|
|
191
|
-
float
|
|
170
|
+
int32_t n_prev = 64; // number of previous tokens to remember
|
|
171
|
+
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
|
172
|
+
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
|
173
|
+
int32_t top_k = 40; // <= 0 to use vocab size
|
|
174
|
+
float top_p = 0.95f; // 1.0 = disabled
|
|
175
|
+
float min_p = 0.05f; // 0.0 = disabled
|
|
176
|
+
float xtc_probability = 0.00f; // 0.0 = disabled
|
|
177
|
+
float xtc_threshold = 0.10f; // > 0.5 disables XTC
|
|
178
|
+
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
|
179
|
+
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
|
180
|
+
float dynatemp_range = 0.00f; // 0.0 = disabled
|
|
181
|
+
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
|
182
|
+
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
|
183
|
+
float penalty_repeat = 1.00f; // 1.0 = disabled
|
|
184
|
+
float penalty_freq = 0.00f; // 0.0 = disabled
|
|
185
|
+
float penalty_present = 0.00f; // 0.0 = disabled
|
|
186
|
+
float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
|
|
187
|
+
float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
|
|
188
|
+
int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
|
|
189
|
+
int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
|
|
190
|
+
float adaptive_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
|
|
191
|
+
float adaptive_decay = 0.90f; // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
|
|
192
|
+
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
|
193
|
+
float top_n_sigma = -1.00f; // -1.0 = disabled
|
|
194
|
+
float mirostat_tau = 5.00f; // target entropy
|
|
195
|
+
float mirostat_eta = 0.10f; // learning rate
|
|
192
196
|
bool ignore_eos = false;
|
|
193
|
-
bool no_perf = false;
|
|
197
|
+
bool no_perf = false; // disable performance metrics
|
|
194
198
|
bool timing_per_token = false;
|
|
195
199
|
|
|
196
200
|
uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
|
|
@@ -476,6 +480,7 @@ struct common_params {
|
|
|
476
480
|
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
|
477
481
|
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
|
478
482
|
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
|
483
|
+
bool cache_prompt = true; // whether to enable prompt caching
|
|
479
484
|
int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
|
|
480
485
|
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
|
|
481
486
|
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
#include "debug.h"
|
|
2
|
+
|
|
3
|
+
#include "log.h"
|
|
4
|
+
|
|
5
|
+
#include <cmath>
|
|
6
|
+
#include <string>
|
|
7
|
+
|
|
8
|
+
static std::string common_ggml_ne_string(const ggml_tensor * t) {
|
|
9
|
+
std::string str;
|
|
10
|
+
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
|
11
|
+
str += std::to_string(t->ne[i]);
|
|
12
|
+
if (i + 1 < GGML_MAX_DIMS) {
|
|
13
|
+
str += ", ";
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
return str;
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
static float common_ggml_get_float_value(const uint8_t * data,
|
|
20
|
+
ggml_type type,
|
|
21
|
+
const size_t * nb,
|
|
22
|
+
size_t i0,
|
|
23
|
+
size_t i1,
|
|
24
|
+
size_t i2,
|
|
25
|
+
size_t i3) {
|
|
26
|
+
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
|
27
|
+
float v;
|
|
28
|
+
if (type == GGML_TYPE_F16) {
|
|
29
|
+
v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
|
|
30
|
+
} else if (type == GGML_TYPE_F32) {
|
|
31
|
+
v = *(const float *) &data[i];
|
|
32
|
+
} else if (type == GGML_TYPE_I64) {
|
|
33
|
+
v = (float) *(const int64_t *) &data[i];
|
|
34
|
+
} else if (type == GGML_TYPE_I32) {
|
|
35
|
+
v = (float) *(const int32_t *) &data[i];
|
|
36
|
+
} else if (type == GGML_TYPE_I16) {
|
|
37
|
+
v = (float) *(const int16_t *) &data[i];
|
|
38
|
+
} else if (type == GGML_TYPE_I8) {
|
|
39
|
+
v = (float) *(const int8_t *) &data[i];
|
|
40
|
+
} else if (type == GGML_TYPE_BF16) {
|
|
41
|
+
v = ggml_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
|
|
42
|
+
} else {
|
|
43
|
+
GGML_ABORT("fatal error");
|
|
44
|
+
}
|
|
45
|
+
return v;
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
template <bool abort>
|
|
49
|
+
void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
|
|
50
|
+
GGML_ASSERT(n > 0);
|
|
51
|
+
float sum = 0;
|
|
52
|
+
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
|
53
|
+
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
|
54
|
+
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
|
55
|
+
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
|
56
|
+
const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
|
|
57
|
+
sum += v;
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
|
63
|
+
LOG_ERR(" [\n");
|
|
64
|
+
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
|
65
|
+
if (i2 == n && ne[2] > 2 * n) {
|
|
66
|
+
LOG_ERR(" ..., \n");
|
|
67
|
+
i2 = ne[2] - n;
|
|
68
|
+
}
|
|
69
|
+
LOG_ERR(" [\n");
|
|
70
|
+
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
|
71
|
+
if (i1 == n && ne[1] > 2 * n) {
|
|
72
|
+
LOG_ERR(" ..., \n");
|
|
73
|
+
i1 = ne[1] - n;
|
|
74
|
+
}
|
|
75
|
+
LOG_ERR(" [");
|
|
76
|
+
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
|
77
|
+
if (i0 == n && ne[0] > 2 * n) {
|
|
78
|
+
LOG_ERR("..., ");
|
|
79
|
+
i0 = ne[0] - n;
|
|
80
|
+
}
|
|
81
|
+
const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
|
|
82
|
+
LOG_ERR("%12.4f", v);
|
|
83
|
+
if (i0 < ne[0] - 1) {
|
|
84
|
+
LOG_ERR(", ");
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
LOG_ERR("],\n");
|
|
88
|
+
}
|
|
89
|
+
LOG_ERR(" ],\n");
|
|
90
|
+
}
|
|
91
|
+
LOG_ERR(" ]\n");
|
|
92
|
+
LOG_ERR(" sum = %f\n", sum);
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
if constexpr (abort) {
|
|
96
|
+
if (std::isnan(sum)) {
|
|
97
|
+
LOG_ERR("encountered NaN - aborting\n");
|
|
98
|
+
exit(0);
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
/**
|
|
104
|
+
* GGML operations callback during the graph execution.
|
|
105
|
+
*
|
|
106
|
+
* @param t current tensor
|
|
107
|
+
* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
|
|
108
|
+
* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
|
|
109
|
+
* see ggml_backend_sched_eval_callback
|
|
110
|
+
* @param user_data user data to pass at each call back
|
|
111
|
+
* @return true to receive data or continue the graph, false otherwise
|
|
112
|
+
*/
|
|
113
|
+
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
|
|
114
|
+
auto * cb_data = (base_callback_data *) user_data;
|
|
115
|
+
|
|
116
|
+
const struct ggml_tensor * src0 = t->src[0];
|
|
117
|
+
const struct ggml_tensor * src1 = t->src[1];
|
|
118
|
+
|
|
119
|
+
if (ask) {
|
|
120
|
+
return true; // Always retrieve data
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
bool matches_filter = cb_data->tensor_filters.empty();
|
|
124
|
+
|
|
125
|
+
if (!matches_filter) {
|
|
126
|
+
for (const auto & filter : cb_data->tensor_filters) {
|
|
127
|
+
if (std::regex_search(t->name, filter)) {
|
|
128
|
+
matches_filter = true;
|
|
129
|
+
break;
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
char src1_str[128] = { 0 };
|
|
135
|
+
if (src1) {
|
|
136
|
+
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str());
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
if (matches_filter) {
|
|
140
|
+
LOG_ERR("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type),
|
|
141
|
+
ggml_op_desc(t), src0->name, common_ggml_ne_string(src0).c_str(), src1 ? src1_str : "",
|
|
142
|
+
common_ggml_ne_string(t).c_str());
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
|
|
146
|
+
|
|
147
|
+
if (!is_host) {
|
|
148
|
+
auto n_bytes = ggml_nbytes(t);
|
|
149
|
+
cb_data->data.resize(n_bytes);
|
|
150
|
+
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
if (!ggml_is_quantized(t->type) && matches_filter) {
|
|
154
|
+
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
|
|
155
|
+
common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
return true;
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
// Explicit template instantiations: debug.h only declares these templates, so the <false> and <true> variants are instantiated here
|
|
162
|
+
template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
|
|
163
|
+
template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
|
|
164
|
+
template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
|
|
165
|
+
template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
#include "common.h"
|
|
3
|
+
#include <string>
|
|
4
|
+
#include <vector>
|
|
5
|
+
#include <regex>
|
|
6
|
+
|
|
7
|
+
// common debug functions and structs
|
|
8
|
+
|
|
9
|
+
// Print a tensor's detailed data
|
|
10
|
+
// data - the tensor's data in byte format
|
|
11
|
+
// type - the tensor's quantization type
|
|
12
|
+
// ne - the tensor dimensions array
|
|
13
|
+
// nb - the tensor strides array
|
|
14
|
+
// n - the number of rows/columns to fully print
|
|
15
|
+
template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);
|
|
16
|
+
|
|
17
|
+
// Intended to use as callback for ggml_backend_sched_eval_callback
|
|
18
|
+
// prints tensors that are processed in the computation graph
|
|
19
|
+
// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
|
|
20
|
+
// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
|
|
21
|
+
// The template parameter determines whether execution is aborted whenever a NaN is encountered
|
|
22
|
+
// in a tensor (useful for stopping debug sessions on first erroneous tensor)
|
|
23
|
+
// The callback data will be passed as the third parameter (user_data)
|
|
24
|
+
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
|
|
25
|
+
struct base_callback_data {
|
|
26
|
+
std::vector<uint8_t> data;
|
|
27
|
+
std::vector<std::regex> tensor_filters;
|
|
28
|
+
|
|
29
|
+
base_callback_data() = default;
|
|
30
|
+
|
|
31
|
+
base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
|
|
32
|
+
for (const auto & pattern : filter_patterns) {
|
|
33
|
+
try {
|
|
34
|
+
std::string anchored_pattern = "^" + pattern;
|
|
35
|
+
tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
|
|
36
|
+
} catch (const std::regex_error & e) {
|
|
37
|
+
throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
params.cb_eval = common_debug_cb_eval<false>;
|
|
41
|
+
params.cb_eval_user_data = this;
|
|
42
|
+
}
|
|
43
|
+
};
|