@fugood/llama.node 1.4.14 → 1.5.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. package/lib/binding.ts +13 -6
  2. package/lib/index.js +2 -2
  3. package/lib/index.ts +8 -3
  4. package/package.json +15 -15
  5. package/scripts/llama.cpp.patch +77 -65
  6. package/src/LlamaContext.cpp +31 -34
  7. package/src/llama.cpp/CMakeLists.txt +24 -8
  8. package/src/llama.cpp/common/CMakeLists.txt +15 -34
  9. package/src/llama.cpp/common/arg.cpp +59 -10
  10. package/src/llama.cpp/common/chat-parser.cpp +115 -0
  11. package/src/llama.cpp/common/chat.cpp +356 -34
  12. package/src/llama.cpp/common/chat.h +17 -13
  13. package/src/llama.cpp/common/common.cpp +0 -1
  14. package/src/llama.cpp/common/common.h +30 -25
  15. package/src/llama.cpp/common/debug.cpp +165 -0
  16. package/src/llama.cpp/common/debug.h +43 -0
  17. package/src/llama.cpp/common/download.cpp +12 -342
  18. package/src/llama.cpp/common/download.h +6 -0
  19. package/src/llama.cpp/common/jinja/caps.cpp +237 -0
  20. package/src/llama.cpp/common/jinja/caps.h +24 -0
  21. package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
  22. package/src/llama.cpp/common/jinja/lexer.h +157 -0
  23. package/src/llama.cpp/common/jinja/parser.cpp +591 -0
  24. package/src/llama.cpp/common/jinja/parser.h +21 -0
  25. package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
  26. package/src/llama.cpp/common/jinja/runtime.h +628 -0
  27. package/src/llama.cpp/common/jinja/string.cpp +207 -0
  28. package/src/llama.cpp/common/jinja/string.h +58 -0
  29. package/src/llama.cpp/common/jinja/utils.h +49 -0
  30. package/src/llama.cpp/common/jinja/value.cpp +1221 -0
  31. package/src/llama.cpp/common/jinja/value.h +464 -0
  32. package/src/llama.cpp/common/preset.cpp +12 -2
  33. package/src/llama.cpp/common/sampling.cpp +52 -19
  34. package/src/llama.cpp/ggml/include/ggml.h +39 -7
  35. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
  37. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
  39. package/src/llama.cpp/include/llama-cpp.h +3 -1
  40. package/src/llama.cpp/include/llama.h +29 -2
  41. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  42. package/src/llama.cpp/src/llama-adapter.cpp +7 -13
  43. package/src/llama.cpp/src/llama-adapter.h +1 -3
  44. package/src/llama.cpp/src/llama-arch.cpp +35 -0
  45. package/src/llama.cpp/src/llama-arch.h +1 -0
  46. package/src/llama.cpp/src/llama-chat.cpp +20 -0
  47. package/src/llama.cpp/src/llama-chat.h +1 -0
  48. package/src/llama.cpp/src/llama-context.cpp +232 -144
  49. package/src/llama.cpp/src/llama-context.h +10 -0
  50. package/src/llama.cpp/src/llama-cparams.h +2 -0
  51. package/src/llama.cpp/src/llama-graph.cpp +31 -43
  52. package/src/llama.cpp/src/llama-hparams.cpp +0 -36
  53. package/src/llama.cpp/src/llama-hparams.h +38 -1
  54. package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
  55. package/src/llama.cpp/src/llama-kv-cache.h +0 -2
  56. package/src/llama.cpp/src/llama-mmap.cpp +13 -6
  57. package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
  58. package/src/llama.cpp/src/llama-model.cpp +215 -97
  59. package/src/llama.cpp/src/llama-model.h +3 -2
  60. package/src/llama.cpp/src/llama-sampling.cpp +170 -13
  61. package/src/llama.cpp/src/llama-vocab.cpp +37 -24
  62. package/src/llama.cpp/src/llama-vocab.h +1 -0
  63. package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
  64. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
  65. package/src/llama.cpp/src/models/models.h +13 -2
  66. package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
@@ -80,6 +80,7 @@ int32_t cpu_get_num_math();
80
80
  //
81
81
 
82
82
  enum llama_example {
83
+ LLAMA_EXAMPLE_BATCHED,
83
84
  LLAMA_EXAMPLE_DEBUG,
84
85
  LLAMA_EXAMPLE_COMMON,
85
86
  LLAMA_EXAMPLE_SPECULATIVE,
@@ -118,6 +119,7 @@ enum common_sampler_type {
118
119
  COMMON_SAMPLER_TYPE_INFILL = 9,
119
120
  COMMON_SAMPLER_TYPE_PENALTIES = 10,
120
121
  COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
122
+ COMMON_SAMPLER_TYPE_ADAPTIVE_P = 12,
121
123
  };
122
124
 
123
125
  // dimensionality reduction methods, used by cvector-generator
@@ -165,32 +167,34 @@ enum common_params_sampling_config : uint64_t {
165
167
  struct common_params_sampling {
166
168
  uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
167
169
 
168
- int32_t n_prev = 64; // number of previous tokens to remember
169
- int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
170
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
171
- int32_t top_k = 40; // <= 0 to use vocab size
172
- float top_p = 0.95f; // 1.0 = disabled
173
- float min_p = 0.05f; // 0.0 = disabled
174
- float xtc_probability = 0.00f; // 0.0 = disabled
175
- float xtc_threshold = 0.10f; // > 0.5 disables XTC
176
- float typ_p = 1.00f; // typical_p, 1.0 = disabled
177
- float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
178
- float dynatemp_range = 0.00f; // 0.0 = disabled
179
- float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
180
- int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
181
- float penalty_repeat = 1.00f; // 1.0 = disabled
182
- float penalty_freq = 0.00f; // 0.0 = disabled
183
- float penalty_present = 0.00f; // 0.0 = disabled
184
- float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
185
- float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
186
- int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
187
- int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
188
- int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
189
- float top_n_sigma = -1.00f;// -1.0 = disabled
190
- float mirostat_tau = 5.00f; // target entropy
191
- float mirostat_eta = 0.10f; // learning rate
170
+ int32_t n_prev = 64; // number of previous tokens to remember
171
+ int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
172
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
173
+ int32_t top_k = 40; // <= 0 to use vocab size
174
+ float top_p = 0.95f; // 1.0 = disabled
175
+ float min_p = 0.05f; // 0.0 = disabled
176
+ float xtc_probability = 0.00f; // 0.0 = disabled
177
+ float xtc_threshold = 0.10f; // > 0.5 disables XTC
178
+ float typ_p = 1.00f; // typical_p, 1.0 = disabled
179
+ float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
180
+ float dynatemp_range = 0.00f; // 0.0 = disabled
181
+ float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
182
+ int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
183
+ float penalty_repeat = 1.00f; // 1.0 = disabled
184
+ float penalty_freq = 0.00f; // 0.0 = disabled
185
+ float penalty_present = 0.00f; // 0.0 = disabled
186
+ float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
187
+ float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
188
+ int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
189
+ int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
190
+ float adaptive_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
191
+ float adaptive_decay = 0.90f; // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
192
+ int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
193
+ float top_n_sigma = -1.00f; // -1.0 = disabled
194
+ float mirostat_tau = 5.00f; // target entropy
195
+ float mirostat_eta = 0.10f; // learning rate
192
196
  bool ignore_eos = false;
193
- bool no_perf = false; // disable performance metrics
197
+ bool no_perf = false; // disable performance metrics
194
198
  bool timing_per_token = false;
195
199
 
196
200
  uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
@@ -476,6 +480,7 @@ struct common_params {
476
480
  int32_t timeout_write = timeout_read; // http write timeout in seconds
477
481
  int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
478
482
  int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
483
+ bool cache_prompt = true; // whether to enable prompt caching
479
484
  int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
480
485
  int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
481
486
 
@@ -0,0 +1,165 @@
1
+ #include "debug.h"
2
+
3
+ #include "log.h"
4
+
5
+ #include <cmath>
6
+ #include <string>
7
+
8
+ static std::string common_ggml_ne_string(const ggml_tensor * t) {
9
+ std::string str;
10
+ for (int i = 0; i < GGML_MAX_DIMS; ++i) {
11
+ str += std::to_string(t->ne[i]);
12
+ if (i + 1 < GGML_MAX_DIMS) {
13
+ str += ", ";
14
+ }
15
+ }
16
+ return str;
17
+ }
18
+
19
+ static float common_ggml_get_float_value(const uint8_t * data,
20
+ ggml_type type,
21
+ const size_t * nb,
22
+ size_t i0,
23
+ size_t i1,
24
+ size_t i2,
25
+ size_t i3) {
26
+ size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
27
+ float v;
28
+ if (type == GGML_TYPE_F16) {
29
+ v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
30
+ } else if (type == GGML_TYPE_F32) {
31
+ v = *(const float *) &data[i];
32
+ } else if (type == GGML_TYPE_I64) {
33
+ v = (float) *(const int64_t *) &data[i];
34
+ } else if (type == GGML_TYPE_I32) {
35
+ v = (float) *(const int32_t *) &data[i];
36
+ } else if (type == GGML_TYPE_I16) {
37
+ v = (float) *(const int16_t *) &data[i];
38
+ } else if (type == GGML_TYPE_I8) {
39
+ v = (float) *(const int8_t *) &data[i];
40
+ } else if (type == GGML_TYPE_BF16) {
41
+ v = ggml_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
42
+ } else {
43
+ GGML_ABORT("fatal error");
44
+ }
45
+ return v;
46
+ }
47
+
48
+ template <bool abort>
49
+ void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
50
+ GGML_ASSERT(n > 0);
51
+ float sum = 0;
52
+ for (int64_t i3 = 0; i3 < ne[3]; i3++) {
53
+ for (int64_t i2 = 0; i2 < ne[2]; i2++) {
54
+ for (int64_t i1 = 0; i1 < ne[1]; i1++) {
55
+ for (int64_t i0 = 0; i0 < ne[0]; i0++) {
56
+ const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
57
+ sum += v;
58
+ }
59
+ }
60
+ }
61
+ }
62
+ for (int64_t i3 = 0; i3 < ne[3]; i3++) {
63
+ LOG_ERR(" [\n");
64
+ for (int64_t i2 = 0; i2 < ne[2]; i2++) {
65
+ if (i2 == n && ne[2] > 2 * n) {
66
+ LOG_ERR(" ..., \n");
67
+ i2 = ne[2] - n;
68
+ }
69
+ LOG_ERR(" [\n");
70
+ for (int64_t i1 = 0; i1 < ne[1]; i1++) {
71
+ if (i1 == n && ne[1] > 2 * n) {
72
+ LOG_ERR(" ..., \n");
73
+ i1 = ne[1] - n;
74
+ }
75
+ LOG_ERR(" [");
76
+ for (int64_t i0 = 0; i0 < ne[0]; i0++) {
77
+ if (i0 == n && ne[0] > 2 * n) {
78
+ LOG_ERR("..., ");
79
+ i0 = ne[0] - n;
80
+ }
81
+ const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
82
+ LOG_ERR("%12.4f", v);
83
+ if (i0 < ne[0] - 1) {
84
+ LOG_ERR(", ");
85
+ }
86
+ }
87
+ LOG_ERR("],\n");
88
+ }
89
+ LOG_ERR(" ],\n");
90
+ }
91
+ LOG_ERR(" ]\n");
92
+ LOG_ERR(" sum = %f\n", sum);
93
+ }
94
+
95
+ if constexpr (abort) {
96
+ if (std::isnan(sum)) {
97
+ LOG_ERR("encountered NaN - aborting\n");
98
+ exit(0);
99
+ }
100
+ }
101
+ }
102
+
103
+ /**
104
+ * GGML operations callback during the graph execution.
105
+ *
106
+ * @param t current tensor
107
+ * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
108
+ * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
109
+ * see ggml_backend_sched_eval_callback
110
+ * @param user_data user data to pass at each call back
111
+ * @return true to receive data or continue the graph, false otherwise
112
+ */
113
+ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
114
+ auto * cb_data = (base_callback_data *) user_data;
115
+
116
+ const struct ggml_tensor * src0 = t->src[0];
117
+ const struct ggml_tensor * src1 = t->src[1];
118
+
119
+ if (ask) {
120
+ return true; // Always retrieve data
121
+ }
122
+
123
+ bool matches_filter = cb_data->tensor_filters.empty();
124
+
125
+ if (!matches_filter) {
126
+ for (const auto & filter : cb_data->tensor_filters) {
127
+ if (std::regex_search(t->name, filter)) {
128
+ matches_filter = true;
129
+ break;
130
+ }
131
+ }
132
+ }
133
+
134
+ char src1_str[128] = { 0 };
135
+ if (src1) {
136
+ snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str());
137
+ }
138
+
139
+ if (matches_filter) {
140
+ LOG_ERR("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type),
141
+ ggml_op_desc(t), src0->name, common_ggml_ne_string(src0).c_str(), src1 ? src1_str : "",
142
+ common_ggml_ne_string(t).c_str());
143
+ }
144
+
145
+ const bool is_host = ggml_backend_buffer_is_host(t->buffer);
146
+
147
+ if (!is_host) {
148
+ auto n_bytes = ggml_nbytes(t);
149
+ cb_data->data.resize(n_bytes);
150
+ ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
151
+ }
152
+
153
+ if (!ggml_is_quantized(t->type) && matches_filter) {
154
+ uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
155
+ common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
156
+ }
157
+
158
+ return true;
159
+ }
160
+
161
+ // Explicit template instantiations
162
+ template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
163
+ template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
164
+ template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
165
+ template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
@@ -0,0 +1,43 @@
1
+ #pragma once
2
+ #include "common.h"
3
+ #include <string>
4
+ #include <vector>
5
+ #include <regex>
6
+
7
+ // common debug functions and structs
8
+
9
+ // Print a tensor's detailed data
10
+ // data - the tensor's data in byte format
11
+ // type - the tensor's quantization type
12
+ // ne - the tensor dimensions array
13
+ // nb - the tensor strides array
14
+ // n - the number of rows/columns to fully print
15
+ template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);
16
+
17
+ // Intended to use as callback for ggml_backend_sched_eval_callback
18
+ // prints tensors that are processed in the computation graph
19
+ // by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
20
+ // non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
21
+ // The template parameter determines whether an error should be thrown whenever a NaN is encountered
22
+ // in a tensor (useful for stopping debug sessions on first erroneous tensor)
23
+ // The callback data will be passed as the third parameter (user_data)
24
+ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
25
+ struct base_callback_data {
26
+ std::vector<uint8_t> data;
27
+ std::vector<std::regex> tensor_filters;
28
+
29
+ base_callback_data() = default;
30
+
31
+ base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
32
+ for (const auto & pattern : filter_patterns) {
33
+ try {
34
+ std::string anchored_pattern = "^" + pattern;
35
+ tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
36
+ } catch (const std::regex_error & e) {
37
+ throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
38
+ }
39
+ }
40
+ params.cb_eval = common_debug_cb_eval<false>;
41
+ params.cb_eval_user_data = this;
42
+ }
43
+ };