@fugood/llama.node 1.4.12 → 1.4.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +9 -9
- package/src/llama.cpp/common/arg.cpp +99 -45
- package/src/llama.cpp/common/chat.cpp +4 -4
- package/src/llama.cpp/common/common.cpp +19 -0
- package/src/llama.cpp/common/common.h +10 -0
- package/src/llama.cpp/common/llguidance.cpp +10 -6
- package/src/llama.cpp/common/regex-partial.cpp +13 -13
- package/src/llama.cpp/common/sampling.cpp +58 -14
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/include/llama.h +87 -8
- package/src/llama.cpp/src/llama-arch.cpp +2 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +615 -28
- package/src/llama.cpp/src/llama-context.h +43 -1
- package/src/llama.cpp/src/llama-grammar.cpp +40 -13
- package/src/llama.cpp/src/llama-grammar.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +173 -5
- package/src/llama.cpp/src/llama-graph.h +71 -6
- package/src/llama.cpp/src/llama-hparams.cpp +4 -0
- package/src/llama.cpp/src/llama-hparams.h +8 -2
- package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
- package/src/llama.cpp/src/llama-model.cpp +51 -11
- package/src/llama.cpp/src/llama-sampling.cpp +1232 -170
- package/src/llama.cpp/src/llama-sampling.h +16 -7
- package/src/llama.cpp/src/llama.cpp +38 -30
- package/src/llama.cpp/src/models/afmoe.cpp +9 -5
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
- package/src/llama.cpp/src/models/modern-bert.cpp +4 -3
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
package/src/llama.cpp/common/sampling.cpp

```diff
@@ -120,17 +120,34 @@ struct common_sampler {
     }
 
     void set_logits(struct llama_context * ctx, int idx) {
-        const auto * logits = llama_get_logits_ith(ctx, idx);
+        const float       * sampled_probs  = llama_get_sampled_probs_ith     (ctx, idx);
+        const float       * sampled_logits = llama_get_sampled_logits_ith    (ctx, idx);
+        const llama_token * sampled_ids    = llama_get_sampled_candidates_ith(ctx, idx);
 
         const llama_model * model = llama_get_model(ctx);
         const llama_vocab * vocab = llama_model_get_vocab(model);
 
         const int n_vocab = llama_vocab_n_tokens(vocab);
 
-        cur.resize(n_vocab);
-
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+        if (sampled_probs) {
+            const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
+            cur.resize(sampled_probs_count);
+            for (uint32_t i = 0; i < sampled_probs_count; ++i) {
+                cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]};
+            }
+        } else if (sampled_logits) {
+            const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
+            cur.resize(sampled_logits_count);
+            for (uint32_t i = 0; i < sampled_logits_count; i++) {
+                cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f};
+            }
+        } else {
+            const auto * logits = llama_get_logits_ith(ctx, idx);
+            GGML_ASSERT(logits != nullptr);
+            cur.resize(n_vocab);
+            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+            }
         }
 
         cur_p = { cur.data(), cur.size(), -1, false };
```
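The updated `set_logits` prefers the backend-sampled probabilities or logits when the context provides them and only falls back to the full-vocabulary logits otherwise; the candidates array maps each position in those arrays back to a vocab token id. A hedged sketch (not part of the package) of looking up the backend-sampled probability of one token using only the getters introduced in this diff:

```cpp
#include "llama.h"

// Sketch: return the backend-sampled probability of `target` at output
// position `idx`, or -1.0f when no backend probabilities are available
// (e.g. backend sampling is not enabled for this sequence).
static float sampled_prob_of(struct llama_context * ctx, int32_t idx, llama_token target) {
    const float       * probs = llama_get_sampled_probs_ith     (ctx, idx);
    const llama_token * ids   = llama_get_sampled_candidates_ith(ctx, idx);
    if (!probs || !ids) {
        return -1.0f;
    }
    const uint32_t n = llama_get_sampled_probs_count_ith(ctx, idx);
    for (uint32_t i = 0; i < n; ++i) {
        if (ids[i] == target) {
            return probs[i]; // candidate order matches the probs/logits arrays
        }
    }
    return 0.0f; // target was filtered out by the backend sampler chain
}
```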
package/src/llama.cpp/common/sampling.cpp

```diff
@@ -159,7 +176,7 @@ std::string common_params_sampling::print() const {
     return std::string(result);
 }
 
-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
+struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params) {
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
```
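The sampling params are now passed by non-const reference because initialization may adjust them (a later hunk in this file disables `backend_sampling` when a grammar is configured). A minimal hedged sketch of a caller, assuming the existing `temp` field of `common_params_sampling` and the new `backend_sampling` flag referenced in this diff:

```cpp
#include "common.h"
#include "sampling.h"

// Sketch: params must now be writable; common_sampler_init() may reset
// backend_sampling when it is incompatible with the configured grammar.
void make_sampler(const llama_model * model) {
    common_params_sampling sparams;
    sparams.temp             = 0.8f;
    sparams.backend_sampling = true; // may be flipped to false by the init call

    common_sampler * smpl = common_sampler_init(model, sparams);

    // ... use smpl ...
    common_sampler_free(smpl);
}
```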
package/src/llama.cpp/common/sampling.cpp

```diff
@@ -179,24 +196,30 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 #endif // LLAMA_USE_LLGUIDANCE
     } else {
         std::vector<std::string> trigger_patterns;
-        std::vector<std::string> patterns_anywhere;
         std::vector<llama_token> trigger_tokens;
         for (const auto & trigger : params.grammar_triggers) {
             switch (trigger.type) {
                 case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
                 {
                     const auto & word = trigger.value;
-                    patterns_anywhere.push_back(regex_escape(word));
+                    trigger_patterns.push_back(regex_escape(word));
                     break;
                 }
                 case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
                 {
-                    patterns_anywhere.push_back(trigger.value);
+                    trigger_patterns.push_back(trigger.value);
                     break;
                 }
                 case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
                 {
-                    trigger_patterns.push_back(trigger.value);
+                    const auto & pattern = trigger.value;
+                    std::string anchored = "^$";
+                    if (!pattern.empty()) {
+                        anchored = (pattern.front() != '^' ? "^" : "")
+                                 + pattern
+                                 + (pattern.back() != '$' ? "$" : "");
+                    }
+                    trigger_patterns.push_back(anchored);
                     break;
                 }
                 case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
```
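The `PATTERN_FULL` case now anchors the trigger pattern itself instead of assuming the caller already did: `"<tool_call>.*"` becomes `"^<tool_call>.*$"`, an already-anchored pattern is left unchanged, and an empty pattern collapses to `"^$"`. A small hedged sketch mirroring the same rule as a standalone helper:

```cpp
#include <string>

// Sketch of the anchoring rule added above for COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL.
static std::string anchor_full_pattern(const std::string & pattern) {
    if (pattern.empty()) {
        return "^$";
    }
    return (pattern.front() != '^' ? "^" : "")
         + pattern
         + (pattern.back() != '$' ? "$" : "");
}

// anchor_full_pattern("<tool_call>.*") -> "^<tool_call>.*$"
// anchor_full_pattern("^already$")     -> "^already$"
```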
package/src/llama.cpp/common/sampling.cpp

```diff
@@ -210,10 +233,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             }
         }
 
-        if (!patterns_anywhere.empty()) {
-            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
-        }
-
         std::vector<const char *> trigger_patterns_c;
         trigger_patterns_c.reserve(trigger_patterns.size());
         for (const auto & regex : trigger_patterns) {
```
package/src/llama.cpp/common/sampling.cpp

```diff
@@ -296,6 +315,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         llama_sampler_chain_add(chain, smpl);
     }
 
+    if (grmr && params.backend_sampling) {
+        LOG_WRN("%s: backend sampling is not compatible with grammar, disabling\n", __func__);
+
+        params.backend_sampling = false;
+    }
+
     auto * result = new common_sampler {
         /* .params = */ params,
         /* .grmr   = */ grmr,
```
package/src/llama.cpp/common/sampling.cpp

```diff
@@ -405,6 +430,25 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     auto & chain = gsmpl->chain;
     auto & cur_p = gsmpl->cur_p; // initialized by set_logits
 
+    // Check if a backend sampler has already sampled a token in which case we
+    // return that token id directly.
+    {
+        id = llama_get_sampled_token_ith(ctx, idx);
+
+        if (id != LLAMA_TOKEN_NULL) {
+            LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
+
+            GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
+
+            // TODO: simplify
+            gsmpl->cur.resize(1);
+            gsmpl->cur[0] = { id, 0.0f, 1.0f };
+            cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
+
+            return id;
+        }
+    }
+
     gsmpl->set_logits(ctx, idx);
 
     if (grammar_first) {
```
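`common_sampler_sample` now first asks the context whether a backend sampler already picked a token for this output and, if so, skips the CPU sampler chain entirely, so existing call sites keep working in both modes. A hedged sketch of a single generation step built on the existing `common_sampler_sample`/`common_sampler_accept` helpers (the surrounding batch handling is illustrative, not taken from this diff):

```cpp
#include "sampling.h"

// Sketch: the backend short-circuit above is internal to common_sampler_sample,
// so the caller looks the same whether sampling ran on the backend or the CPU.
llama_token generate_one(common_sampler * smpl, llama_context * ctx, int32_t idx) {
    const llama_token id = common_sampler_sample(smpl, ctx, idx, /* grammar_first */ false);

    // keep the sampler state (and grammar, when used) in sync with the output
    common_sampler_accept(smpl, id, /* accept_grammar */ true);

    return id;
}
```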
package/src/llama.cpp/common/sampling.h

```diff
@@ -36,7 +36,8 @@ struct common_sampler;
 
 // llama_sampler API overloads
 
-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
+// note: can mutate params in some cases
+struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params);
 
 void common_sampler_free(struct common_sampler * gsmpl);
 
```
package/src/llama.cpp/common/sampling.h

```diff
@@ -48,6 +49,7 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 // arguments can be nullptr to skip printing
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
 
+// get the underlying llama_sampler_chain
 struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
 
 // extended sampling implementation:
```
package/src/llama.cpp/include/llama.h

```diff
@@ -316,6 +316,11 @@ extern "C" {
         bool no_alloc; // only load metadata and simulate memory allocations
     };
 
+    struct llama_sampler_seq_config {
+        llama_seq_id           seq_id;
+        struct llama_sampler * sampler;
+    };
+
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
     // https://github.com/ggml-org/llama.cpp/pull/7544
     struct llama_context_params {
```
package/src/llama.cpp/include/llama.h

```diff
@@ -364,6 +369,12 @@ extern "C" {
         bool kv_unified; // use a unified buffer across the input sequences when computing the attention
                          // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
                          // ref: https://github.com/ggml-org/llama.cpp/pull/14363
+
+        // [EXPERIMENTAL]
+        // backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
+        // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
+        struct llama_sampler_seq_config * samplers;
+        size_t n_samplers;
     };
 
     // model quantization parameters
```
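`llama_context_params` gains an experimental per-sequence sampler configuration: each `llama_sampler_seq_config` pairs a `seq_id` with a sampler chain that the context will try to run on the backend. A hedged sketch of wiring one chain into sequence 0 at context creation, using existing llama.cpp chain/sampler constructors (whether the chain can actually run on the backend is decided internally):

```cpp
#include "llama.h"

// Sketch: attach a backend sampler chain to sequence 0 via the new
// samplers/n_samplers fields. The chain must outlive the context.
struct llama_context * create_ctx_with_backend_sampling(struct llama_model * model) {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    struct llama_sampler_seq_config seq_cfg = {
        /* .seq_id  = */ 0,
        /* .sampler = */ chain,
    };

    struct llama_context_params cparams = llama_context_default_params();
    cparams.samplers   = &seq_cfg;
    cparams.n_samplers = 1;

    return llama_init_from_model(model, cparams);
}
```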
package/src/llama.cpp/include/llama.h

```diff
@@ -524,6 +535,7 @@ extern "C" {
     LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
```
package/src/llama.cpp/include/llama.h

```diff
@@ -992,6 +1004,32 @@ extern "C" {
     // otherwise: float[n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
 
+    //
+    // backend sampling API [EXPERIMENTAL]
+    // note: use only if the llama_context was created with at least one llama_sampler_seq_config
+    //
+
+    // Get the backend sampled token for the ith token.
+    // Returns LLAMA_TOKEN_NULL if no token was sampled.
+    LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the backend sampled probabilites for the ith token
+    // The index matches llama_get_sampled_token_ith().
+    // Returns NULL if no probabilites were generated.
+    LLAMA_API float *  llama_get_sampled_probs_ith      (struct llama_context * ctx, int32_t i);
+    LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the backend sampled logits for the ith token
+    // Returns NULL if no logits were sampled.
+    LLAMA_API float *  llama_get_sampled_logits_ith      (struct llama_context * ctx, int32_t i);
+    LLAMA_API uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the backend sampled candidates (token ids) for the ith token
+    // These are needed to map probability/logit indices to vocab token ids.
+    // Returns NULL if no candidates were sampled.
+    LLAMA_API llama_token * llama_get_sampled_candidates_ith      (struct llama_context * ctx, int32_t i);
+    LLAMA_API uint32_t      llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i);
+
     //
     // Vocab
     //
```
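After `llama_decode`, these getters expose what the backend sampler produced for each output position. A hedged usage sketch: read the sampled token and, when available, the candidate probabilities, falling back to CPU sampling when nothing was sampled on the backend:

```cpp
#include <cstdio>
#include "llama.h"

// Sketch: inspect the backend sampling results for output position i.
// LLAMA_TOKEN_NULL means no backend sampler ran for this position and the
// caller should sample on the CPU from llama_get_logits_ith() as before.
void report_sampled(struct llama_context * ctx, int32_t i) {
    const llama_token id = llama_get_sampled_token_ith(ctx, i);
    if (id == LLAMA_TOKEN_NULL) {
        printf("position %d: no backend-sampled token, use CPU sampling\n", i);
        return;
    }

    printf("position %d: backend sampled token %d\n", i, id);

    const float       * probs = llama_get_sampled_probs_ith     (ctx, i);
    const llama_token * cands = llama_get_sampled_candidates_ith(ctx, i);
    if (probs && cands) {
        const uint32_t n = llama_get_sampled_probs_count_ith(ctx, i);
        for (uint32_t k = 0; k < n && k < 5; ++k) {
            printf("  candidate %d: p = %.4f\n", cands[k], probs[k]);
        }
    }
}
```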
package/src/llama.cpp/include/llama.h

```diff
@@ -1163,11 +1201,16 @@ extern "C" {
     //
     //    llama_sampler_free(smpl);
     //
-    // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
-    //
 
     typedef void * llama_sampler_context_t;
 
+    struct llama_sampler_data {
+        struct ggml_tensor * logits;
+        struct ggml_tensor * probs;
+        struct ggml_tensor * sampled;
+        struct ggml_tensor * candidates;
+    };
+
     // user code can implement the interface below in order to create custom llama_sampler
     struct llama_sampler_i {
         const char * (*name) (const struct llama_sampler * smpl); // can be NULL
```
package/src/llama.cpp/include/llama.h

```diff
@@ -1177,17 +1220,45 @@ extern "C" {
         struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL
         void                   (*free)  (      struct llama_sampler * smpl); // can be NULL if ctx is NULL
 
-        //
-        //
+        // [EXPERIMENTAL]
+        // backend sampling interface:
+
+        // return true if the backend supports all ops needed by the sampler
+        // note: call once per sampler
+        bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft);
+
+        // call after .backend_apply()
+        void (*backend_accept)(
+                struct llama_sampler * smpl,
+                struct ggml_context * ctx,
+                struct ggml_cgraph * gf,
+                struct ggml_tensor * selected_token);
+
+        // call after .backend_init()
+        void (*backend_apply)(
+                struct llama_sampler * smpl,
+                struct ggml_context * ctx,
+                struct ggml_cgraph * gf,
+                struct llama_sampler_data * data);
+
+        // called before graph execution to set inputs for the current ubatch
+        void (*backend_set_input)(struct llama_sampler * smpl);
     };
 
     struct llama_sampler {
-        const struct llama_sampler_i * iface;
-        llama_sampler_context_t        ctx;
+        struct llama_sampler_i * iface;
+
+        llama_sampler_context_t ctx;
     };
 
+    // [EXPERIMENTAL]
+    // attach a sampler to the context
+    // note: prefer initializing the context with llama_context_params.samplers when possible
+    // note: changing the samplers of a context can cause graph reallocations and degraded performance
+    LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
+
     // mirror of llama_sampler_i:
-    LLAMA_API struct llama_sampler * llama_sampler_init  (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
+    LLAMA_API struct llama_sampler * llama_sampler_init  ( struct llama_sampler_i * iface, llama_sampler_context_t ctx);
     LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
     LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
     LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
```
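Custom samplers can opt into graph-based sampling by implementing the new `backend_*` callbacks. A heavily hedged sketch of a greedy sampler: the callback names and signatures come from the `llama_sampler_i` changes above, but the assumption that `backend_apply()` reports its choice by assigning `data->sampled` is an illustration inferred from the getter comments, not something this diff states:

```cpp
#include "llama.h"
#include "ggml.h"

static const char * greedy_name(const struct llama_sampler * /*smpl*/) {
    return "greedy-backend";
}

// CPU fallback: select the highest-logit candidate
static void greedy_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
    size_t best = 0;
    for (size_t i = 1; i < cur_p->size; ++i) {
        if (cur_p->data[i].logit > cur_p->data[best].logit) {
            best = i;
        }
    }
    cur_p->selected = (int64_t) best;
}

static bool greedy_backend_init(struct llama_sampler * /*smpl*/, ggml_backend_buffer_type_t /*buft*/) {
    return true; // assume the backend can run the argmax op
}

static void greedy_backend_apply(
        struct llama_sampler      * /*smpl*/,
        struct ggml_context       * ctx,
        struct ggml_cgraph        * gf,
        struct llama_sampler_data * data) {
    // assumption: expose the argmax over the logits as the sampled result
    data->sampled = ggml_argmax(ctx, data->logits);
    ggml_build_forward_expand(gf, data->sampled);
}

static struct llama_sampler_i greedy_iface = {
    /* .name              = */ greedy_name,
    /* .accept            = */ nullptr,
    /* .apply             = */ greedy_apply,
    /* .reset             = */ nullptr,
    /* .clone             = */ nullptr,
    /* .free              = */ nullptr,
    /* .backend_init      = */ greedy_backend_init,
    /* .backend_accept    = */ nullptr,
    /* .backend_apply     = */ greedy_backend_apply,
    /* .backend_set_input = */ nullptr,
};

// usage: struct llama_sampler * smpl = llama_sampler_init(&greedy_iface, /* ctx */ nullptr);
```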
package/src/llama.cpp/include/llama.h

```diff
@@ -1203,7 +1274,15 @@ extern "C" {
 
     // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
     LLAMA_API void llama_sampler_chain_add( struct llama_sampler * chain, struct llama_sampler * smpl);
-    LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
+
+    // return NULL if:
+    // - the sampler is NULL
+    // - the sampler is not a llama_sampler_chain
+    // - the index is out of bounds, unless i == -1
+    // - if i == -1, returns the chain itself (can be used to check if the sampler is a chain)
+    LLAMA_API struct llama_sampler * llama_sampler_chain_get( struct llama_sampler * chain, int32_t i);
+
+    // the total number of samplers in the chain
     LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain);
 
     // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
```
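`llama_sampler_chain_get` now documents its NULL-returning cases, with `i == -1` returning the chain itself as a cheap "is this a chain?" check. A small hedged sketch that walks a chain and prints each sampler's name using only documented calls:

```cpp
#include <cstdio>
#include "llama.h"

// Sketch: enumerate the samplers in a chain, or report that smpl is not a chain.
void print_chain(struct llama_sampler * smpl) {
    if (llama_sampler_chain_get(smpl, -1) == NULL) {
        printf("%s is not a sampler chain\n", llama_sampler_name(smpl));
        return;
    }
    const int n = llama_sampler_chain_n(smpl);
    for (int i = 0; i < n; ++i) {
        struct llama_sampler * s = llama_sampler_chain_get(smpl, i);
        printf("  [%d] %s\n", i, llama_sampler_name(s));
    }
}
```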
package/src/llama.cpp/src/llama-arch.cpp

```diff
@@ -152,6 +152,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_VOCAB_SIZE,                 "%s.vocab_size" },
     { LLM_KV_CONTEXT_LENGTH,             "%s.context_length" },
     { LLM_KV_EMBEDDING_LENGTH,           "%s.embedding_length" },
+    { LLM_KV_EMBEDDING_LENGTH_OUT,       "%s.embedding_length_out" },
     { LLM_KV_FEATURES_LENGTH,            "%s.features_length" },
     { LLM_KV_BLOCK_COUNT,                "%s.block_count" },
     { LLM_KV_LEADING_DENSE_BLOCK_COUNT,  "%s.leading_dense_block_count" },
```
package/src/llama.cpp/src/llama-arch.cpp

```diff
@@ -2075,6 +2076,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                 LLM_TENSOR_TOKEN_EMBD,
                 LLM_TENSOR_OUTPUT_NORM_LFM2,
                 LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_DENSE_2_OUT,
             };
         case LLM_ARCH_LFM2MOE:
             return {
```