@fugood/llama.node 1.4.13 → 1.4.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +23 -2
- package/lib/index.js +2 -1
- package/lib/index.ts +8 -1
- package/lib/parallel.ts +2 -2
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +9 -12
- package/src/LlamaContext.cpp +16 -4
- package/src/llama.cpp/CMakeLists.txt +24 -8
- package/src/llama.cpp/common/CMakeLists.txt +3 -34
- package/src/llama.cpp/common/arg.cpp +183 -60
- package/src/llama.cpp/common/arg.h +0 -8
- package/src/llama.cpp/common/chat-parser.cpp +115 -0
- package/src/llama.cpp/common/chat.cpp +67 -0
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +2 -1
- package/src/llama.cpp/common/common.h +12 -7
- package/src/llama.cpp/common/debug.cpp +165 -0
- package/src/llama.cpp/common/debug.h +43 -0
- package/src/llama.cpp/common/download.cpp +88 -369
- package/src/llama.cpp/common/download.h +32 -5
- package/src/llama.cpp/common/preset.cpp +87 -2
- package/src/llama.cpp/common/preset.h +10 -1
- package/src/llama.cpp/ggml/include/ggml.h +5 -0
- package/src/llama.cpp/include/llama.h +5 -2
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +35 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +20 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +31 -43
- package/src/llama.cpp/src/llama-mmap.cpp +78 -42
- package/src/llama.cpp/src/llama-mmap.h +5 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +225 -101
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +37 -24
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/llama.cpp +63 -27
- package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
- package/src/llama.cpp/src/models/models.h +13 -2
- package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
#include "debug.h"
|
|
2
|
+
|
|
3
|
+
#include "log.h"
|
|
4
|
+
|
|
5
|
+
#include <cmath>
|
|
6
|
+
#include <string>
|
|
7
|
+
|
|
8
|
+
static std::string common_ggml_ne_string(const ggml_tensor * t) {
|
|
9
|
+
std::string str;
|
|
10
|
+
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
|
11
|
+
str += std::to_string(t->ne[i]);
|
|
12
|
+
if (i + 1 < GGML_MAX_DIMS) {
|
|
13
|
+
str += ", ";
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
return str;
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
static float common_ggml_get_float_value(const uint8_t * data,
|
|
20
|
+
ggml_type type,
|
|
21
|
+
const size_t * nb,
|
|
22
|
+
size_t i0,
|
|
23
|
+
size_t i1,
|
|
24
|
+
size_t i2,
|
|
25
|
+
size_t i3) {
|
|
26
|
+
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
|
27
|
+
float v;
|
|
28
|
+
if (type == GGML_TYPE_F16) {
|
|
29
|
+
v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
|
|
30
|
+
} else if (type == GGML_TYPE_F32) {
|
|
31
|
+
v = *(const float *) &data[i];
|
|
32
|
+
} else if (type == GGML_TYPE_I64) {
|
|
33
|
+
v = (float) *(const int64_t *) &data[i];
|
|
34
|
+
} else if (type == GGML_TYPE_I32) {
|
|
35
|
+
v = (float) *(const int32_t *) &data[i];
|
|
36
|
+
} else if (type == GGML_TYPE_I16) {
|
|
37
|
+
v = (float) *(const int16_t *) &data[i];
|
|
38
|
+
} else if (type == GGML_TYPE_I8) {
|
|
39
|
+
v = (float) *(const int8_t *) &data[i];
|
|
40
|
+
} else if (type == GGML_TYPE_BF16) {
|
|
41
|
+
v = ggml_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
|
|
42
|
+
} else {
|
|
43
|
+
GGML_ABORT("fatal error");
|
|
44
|
+
}
|
|
45
|
+
return v;
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
template <bool abort>
|
|
49
|
+
void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
|
|
50
|
+
GGML_ASSERT(n > 0);
|
|
51
|
+
float sum = 0;
|
|
52
|
+
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
|
53
|
+
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
|
54
|
+
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
|
55
|
+
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
|
56
|
+
const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
|
|
57
|
+
sum += v;
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
|
63
|
+
LOG_ERR(" [\n");
|
|
64
|
+
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
|
65
|
+
if (i2 == n && ne[2] > 2 * n) {
|
|
66
|
+
LOG_ERR(" ..., \n");
|
|
67
|
+
i2 = ne[2] - n;
|
|
68
|
+
}
|
|
69
|
+
LOG_ERR(" [\n");
|
|
70
|
+
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
|
71
|
+
if (i1 == n && ne[1] > 2 * n) {
|
|
72
|
+
LOG_ERR(" ..., \n");
|
|
73
|
+
i1 = ne[1] - n;
|
|
74
|
+
}
|
|
75
|
+
LOG_ERR(" [");
|
|
76
|
+
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
|
77
|
+
if (i0 == n && ne[0] > 2 * n) {
|
|
78
|
+
LOG_ERR("..., ");
|
|
79
|
+
i0 = ne[0] - n;
|
|
80
|
+
}
|
|
81
|
+
const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
|
|
82
|
+
LOG_ERR("%12.4f", v);
|
|
83
|
+
if (i0 < ne[0] - 1) {
|
|
84
|
+
LOG_ERR(", ");
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
LOG_ERR("],\n");
|
|
88
|
+
}
|
|
89
|
+
LOG_ERR(" ],\n");
|
|
90
|
+
}
|
|
91
|
+
LOG_ERR(" ]\n");
|
|
92
|
+
LOG_ERR(" sum = %f\n", sum);
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
if constexpr (abort) {
|
|
96
|
+
if (std::isnan(sum)) {
|
|
97
|
+
LOG_ERR("encountered NaN - aborting\n");
|
|
98
|
+
exit(0);
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
/**
|
|
104
|
+
* GGML operations callback during the graph execution.
|
|
105
|
+
*
|
|
106
|
+
* @param t current tensor
|
|
107
|
+
* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
|
|
108
|
+
* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
|
|
109
|
+
* see ggml_backend_sched_eval_callback
|
|
110
|
+
* @param user_data user data to pass at each call back
|
|
111
|
+
* @return true to receive data or continue the graph, false otherwise
|
|
112
|
+
*/
|
|
113
|
+
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
|
|
114
|
+
auto * cb_data = (base_callback_data *) user_data;
|
|
115
|
+
|
|
116
|
+
const struct ggml_tensor * src0 = t->src[0];
|
|
117
|
+
const struct ggml_tensor * src1 = t->src[1];
|
|
118
|
+
|
|
119
|
+
if (ask) {
|
|
120
|
+
return true; // Always retrieve data
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
bool matches_filter = cb_data->tensor_filters.empty();
|
|
124
|
+
|
|
125
|
+
if (!matches_filter) {
|
|
126
|
+
for (const auto & filter : cb_data->tensor_filters) {
|
|
127
|
+
if (std::regex_search(t->name, filter)) {
|
|
128
|
+
matches_filter = true;
|
|
129
|
+
break;
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
char src1_str[128] = { 0 };
|
|
135
|
+
if (src1) {
|
|
136
|
+
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str());
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
if (matches_filter) {
|
|
140
|
+
LOG_ERR("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type),
|
|
141
|
+
ggml_op_desc(t), src0->name, common_ggml_ne_string(src0).c_str(), src1 ? src1_str : "",
|
|
142
|
+
common_ggml_ne_string(t).c_str());
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
|
|
146
|
+
|
|
147
|
+
if (!is_host) {
|
|
148
|
+
auto n_bytes = ggml_nbytes(t);
|
|
149
|
+
cb_data->data.resize(n_bytes);
|
|
150
|
+
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
if (!ggml_is_quantized(t->type) && matches_filter) {
|
|
154
|
+
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
|
|
155
|
+
common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
return true;
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
// Explicit template instantiations
// Both the abort-on-NaN and non-aborting variants are emitted here so that
// translation units including debug.h can link either specialization without
// seeing the template definitions in this file.
template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
#include "common.h"
|
|
3
|
+
#include <string>
|
|
4
|
+
#include <vector>
|
|
5
|
+
#include <regex>
|
|
6
|
+
|
|
7
|
+
// common debug functions and structs
|
|
8
|
+
|
|
9
|
+
// Print a tensor's detailed data
|
|
10
|
+
// data - the tensor's data in byte format
|
|
11
|
+
// type - the tensor's quantization type
|
|
12
|
+
// ne - the tensor dimensions array
|
|
13
|
+
// nb - the tensor strides array
|
|
14
|
+
// n - the number of rows/columns to fully print
|
|
15
|
+
template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);
|
|
16
|
+
|
|
17
|
+
// Intended to use as callback for ggml_backend_sched_eval_callback
|
|
18
|
+
// prints tensors that are processed in the computation graph
|
|
19
|
+
// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
|
|
20
|
+
// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
|
|
21
|
+
// The template parameter determines whether an error should be thrown whenever a NaN is encountered
|
|
22
|
+
// in a tensor (useful for stopping debug sessions on first erroneous tensor)
|
|
23
|
+
// The callback data will be passed as the third parameter (user_data)
|
|
24
|
+
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
|
|
25
|
+
struct base_callback_data {
    // scratch buffer used by the eval callback to copy tensor data from
    // non-host backends before printing
    std::vector<uint8_t> data;
    // compiled tensor-name filters; empty means "print every tensor"
    std::vector<std::regex> tensor_filters;

    base_callback_data() = default;

    // Compiles `filter_patterns` into start-anchored regexes and registers the
    // non-aborting debug callback (common_debug_cb_eval<false>) on `params`.
    // Throws std::runtime_error if any pattern is not a valid std::regex.
    // NOTE(review): `params.cb_eval_user_data` is set to `this`, so this
    // instance must outlive any use of `params`, and copying/moving it after
    // construction leaves the stored pointer dangling — confirm at call sites.
    base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
        for (const auto & pattern : filter_patterns) {
            try {
                // anchor at the start so a pattern matches tensor-name prefixes
                std::string anchored_pattern = "^" + pattern;
                tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
            } catch (const std::regex_error & e) {
                throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
            }
        }
        params.cb_eval = common_debug_cb_eval<false>;
        params.cb_eval_user_data = this;
    }
};
|