@fugood/llama.node 1.4.13 → 1.4.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/lib/binding.ts +23 -2
  2. package/lib/index.js +2 -1
  3. package/lib/index.ts +8 -1
  4. package/lib/parallel.ts +2 -2
  5. package/package.json +15 -15
  6. package/scripts/llama.cpp.patch +9 -12
  7. package/src/LlamaContext.cpp +16 -4
  8. package/src/llama.cpp/CMakeLists.txt +24 -8
  9. package/src/llama.cpp/common/CMakeLists.txt +3 -34
  10. package/src/llama.cpp/common/arg.cpp +183 -60
  11. package/src/llama.cpp/common/arg.h +0 -8
  12. package/src/llama.cpp/common/chat-parser.cpp +115 -0
  13. package/src/llama.cpp/common/chat.cpp +67 -0
  14. package/src/llama.cpp/common/chat.h +1 -0
  15. package/src/llama.cpp/common/common.cpp +2 -1
  16. package/src/llama.cpp/common/common.h +12 -7
  17. package/src/llama.cpp/common/debug.cpp +165 -0
  18. package/src/llama.cpp/common/debug.h +43 -0
  19. package/src/llama.cpp/common/download.cpp +88 -369
  20. package/src/llama.cpp/common/download.h +32 -5
  21. package/src/llama.cpp/common/preset.cpp +87 -2
  22. package/src/llama.cpp/common/preset.h +10 -1
  23. package/src/llama.cpp/ggml/include/ggml.h +5 -0
  24. package/src/llama.cpp/include/llama.h +5 -2
  25. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  26. package/src/llama.cpp/src/llama-arch.cpp +35 -0
  27. package/src/llama.cpp/src/llama-arch.h +1 -0
  28. package/src/llama.cpp/src/llama-chat.cpp +20 -0
  29. package/src/llama.cpp/src/llama-chat.h +1 -0
  30. package/src/llama.cpp/src/llama-graph.cpp +31 -43
  31. package/src/llama.cpp/src/llama-mmap.cpp +78 -42
  32. package/src/llama.cpp/src/llama-mmap.h +5 -4
  33. package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
  34. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  35. package/src/llama.cpp/src/llama-model.cpp +225 -101
  36. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  37. package/src/llama.cpp/src/llama-sampling.cpp +1 -1
  38. package/src/llama.cpp/src/llama-vocab.cpp +37 -24
  39. package/src/llama.cpp/src/llama-vocab.h +1 -0
  40. package/src/llama.cpp/src/llama.cpp +63 -27
  41. package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
  42. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
  43. package/src/llama.cpp/src/models/models.h +13 -2
  44. package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
@@ -0,0 +1,165 @@
1
+ #include "debug.h"
2
+
3
+ #include "log.h"
4
+
5
+ #include <cmath>
6
+ #include <string>
7
+
8
+ static std::string common_ggml_ne_string(const ggml_tensor * t) {
9
+ std::string str;
10
+ for (int i = 0; i < GGML_MAX_DIMS; ++i) {
11
+ str += std::to_string(t->ne[i]);
12
+ if (i + 1 < GGML_MAX_DIMS) {
13
+ str += ", ";
14
+ }
15
+ }
16
+ return str;
17
+ }
18
+
19
+ static float common_ggml_get_float_value(const uint8_t * data,
20
+ ggml_type type,
21
+ const size_t * nb,
22
+ size_t i0,
23
+ size_t i1,
24
+ size_t i2,
25
+ size_t i3) {
26
+ size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
27
+ float v;
28
+ if (type == GGML_TYPE_F16) {
29
+ v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
30
+ } else if (type == GGML_TYPE_F32) {
31
+ v = *(const float *) &data[i];
32
+ } else if (type == GGML_TYPE_I64) {
33
+ v = (float) *(const int64_t *) &data[i];
34
+ } else if (type == GGML_TYPE_I32) {
35
+ v = (float) *(const int32_t *) &data[i];
36
+ } else if (type == GGML_TYPE_I16) {
37
+ v = (float) *(const int16_t *) &data[i];
38
+ } else if (type == GGML_TYPE_I8) {
39
+ v = (float) *(const int8_t *) &data[i];
40
+ } else if (type == GGML_TYPE_BF16) {
41
+ v = ggml_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
42
+ } else {
43
+ GGML_ABORT("fatal error");
44
+ }
45
+ return v;
46
+ }
47
+
48
+ template <bool abort>
49
+ void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
50
+ GGML_ASSERT(n > 0);
51
+ float sum = 0;
52
+ for (int64_t i3 = 0; i3 < ne[3]; i3++) {
53
+ for (int64_t i2 = 0; i2 < ne[2]; i2++) {
54
+ for (int64_t i1 = 0; i1 < ne[1]; i1++) {
55
+ for (int64_t i0 = 0; i0 < ne[0]; i0++) {
56
+ const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
57
+ sum += v;
58
+ }
59
+ }
60
+ }
61
+ }
62
+ for (int64_t i3 = 0; i3 < ne[3]; i3++) {
63
+ LOG_ERR(" [\n");
64
+ for (int64_t i2 = 0; i2 < ne[2]; i2++) {
65
+ if (i2 == n && ne[2] > 2 * n) {
66
+ LOG_ERR(" ..., \n");
67
+ i2 = ne[2] - n;
68
+ }
69
+ LOG_ERR(" [\n");
70
+ for (int64_t i1 = 0; i1 < ne[1]; i1++) {
71
+ if (i1 == n && ne[1] > 2 * n) {
72
+ LOG_ERR(" ..., \n");
73
+ i1 = ne[1] - n;
74
+ }
75
+ LOG_ERR(" [");
76
+ for (int64_t i0 = 0; i0 < ne[0]; i0++) {
77
+ if (i0 == n && ne[0] > 2 * n) {
78
+ LOG_ERR("..., ");
79
+ i0 = ne[0] - n;
80
+ }
81
+ const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
82
+ LOG_ERR("%12.4f", v);
83
+ if (i0 < ne[0] - 1) {
84
+ LOG_ERR(", ");
85
+ }
86
+ }
87
+ LOG_ERR("],\n");
88
+ }
89
+ LOG_ERR(" ],\n");
90
+ }
91
+ LOG_ERR(" ]\n");
92
+ LOG_ERR(" sum = %f\n", sum);
93
+ }
94
+
95
+ if constexpr (abort) {
96
+ if (std::isnan(sum)) {
97
+ LOG_ERR("encountered NaN - aborting\n");
98
+ exit(0);
99
+ }
100
+ }
101
+ }
102
+
103
+ /**
104
+ * GGML operations callback during the graph execution.
105
+ *
106
+ * @param t current tensor
107
+ * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
108
+ * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
109
+ * see ggml_backend_sched_eval_callback
110
+ * @param user_data user data to pass at each call back
111
+ * @return true to receive data or continue the graph, false otherwise
112
+ */
113
+ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
114
+ auto * cb_data = (base_callback_data *) user_data;
115
+
116
+ const struct ggml_tensor * src0 = t->src[0];
117
+ const struct ggml_tensor * src1 = t->src[1];
118
+
119
+ if (ask) {
120
+ return true; // Always retrieve data
121
+ }
122
+
123
+ bool matches_filter = cb_data->tensor_filters.empty();
124
+
125
+ if (!matches_filter) {
126
+ for (const auto & filter : cb_data->tensor_filters) {
127
+ if (std::regex_search(t->name, filter)) {
128
+ matches_filter = true;
129
+ break;
130
+ }
131
+ }
132
+ }
133
+
134
+ char src1_str[128] = { 0 };
135
+ if (src1) {
136
+ snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str());
137
+ }
138
+
139
+ if (matches_filter) {
140
+ LOG_ERR("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type),
141
+ ggml_op_desc(t), src0->name, common_ggml_ne_string(src0).c_str(), src1 ? src1_str : "",
142
+ common_ggml_ne_string(t).c_str());
143
+ }
144
+
145
+ const bool is_host = ggml_backend_buffer_is_host(t->buffer);
146
+
147
+ if (!is_host) {
148
+ auto n_bytes = ggml_nbytes(t);
149
+ cb_data->data.resize(n_bytes);
150
+ ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
151
+ }
152
+
153
+ if (!ggml_is_quantized(t->type) && matches_filter) {
154
+ uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
155
+ common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
156
+ }
157
+
158
+ return true;
159
+ }
160
+
161
+ // Explicit template instantiations
162
+ template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
163
+ template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
164
+ template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
165
+ template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
@@ -0,0 +1,43 @@
1
+ #pragma once
2
+ #include "common.h"
3
+ #include <string>
4
+ #include <vector>
5
+ #include <regex>
6
+
7
+ // common debug functions and structs
8
+
9
+ // Print a tensor's detailed data
10
+ // data - the tensor's data in byte format
11
+ // type - the tensor's quantization type
12
+ // ne - the tensor dimensions array
13
+ // nb - the tensor strides array
14
+ // n - the number of rows/columns to fully print
15
+ template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);
16
+
17
+ // Intended to use as callback for ggml_backend_sched_eval_callback
18
+ // prints tensors that are processed in the computation graph
19
+ // by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
20
+ // non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
21
+ // The template parameter determines whether the process exits whenever a NaN is encountered
22
+ // in a tensor (useful for stopping debug sessions on first erroneous tensor)
23
+ // The callback data will be passed as the third parameter (user_data)
24
+ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
25
+ struct base_callback_data {
26
+ std::vector<uint8_t> data;
27
+ std::vector<std::regex> tensor_filters;
28
+
29
+ base_callback_data() = default;
30
+
31
+ base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
32
+ for (const auto & pattern : filter_patterns) {
33
+ try {
34
+ std::string anchored_pattern = "^" + pattern;
35
+ tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
36
+ } catch (const std::regex_error & e) {
37
+ throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
38
+ }
39
+ }
40
+ params.cb_eval = common_debug_cb_eval<false>;
41
+ params.cb_eval_user_data = this;
42
+ }
43
+ };