@fugood/llama.node 1.4.12 → 1.4.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/package.json +15 -15
  2. package/scripts/llama.cpp.patch +9 -9
  3. package/src/llama.cpp/common/arg.cpp +99 -45
  4. package/src/llama.cpp/common/chat.cpp +4 -4
  5. package/src/llama.cpp/common/common.cpp +19 -0
  6. package/src/llama.cpp/common/common.h +10 -0
  7. package/src/llama.cpp/common/llguidance.cpp +10 -6
  8. package/src/llama.cpp/common/regex-partial.cpp +13 -13
  9. package/src/llama.cpp/common/sampling.cpp +58 -14
  10. package/src/llama.cpp/common/sampling.h +3 -1
  11. package/src/llama.cpp/include/llama.h +87 -8
  12. package/src/llama.cpp/src/llama-arch.cpp +2 -0
  13. package/src/llama.cpp/src/llama-arch.h +1 -0
  14. package/src/llama.cpp/src/llama-context.cpp +615 -28
  15. package/src/llama.cpp/src/llama-context.h +43 -1
  16. package/src/llama.cpp/src/llama-grammar.cpp +40 -13
  17. package/src/llama.cpp/src/llama-grammar.h +2 -0
  18. package/src/llama.cpp/src/llama-graph.cpp +173 -5
  19. package/src/llama.cpp/src/llama-graph.h +71 -6
  20. package/src/llama.cpp/src/llama-hparams.cpp +4 -0
  21. package/src/llama.cpp/src/llama-hparams.h +8 -2
  22. package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
  23. package/src/llama.cpp/src/llama-model.cpp +51 -11
  24. package/src/llama.cpp/src/llama-sampling.cpp +1232 -170
  25. package/src/llama.cpp/src/llama-sampling.h +16 -7
  26. package/src/llama.cpp/src/llama.cpp +38 -30
  27. package/src/llama.cpp/src/models/afmoe.cpp +9 -5
  28. package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  29. package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  30. package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
  31. package/src/llama.cpp/src/models/modern-bert.cpp +4 -3
  32. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  33. package/src/llama.cpp/src/models/smallthinker.cpp +11 -5

package/src/llama.cpp/common/sampling.cpp
@@ -120,17 +120,34 @@ struct common_sampler {
     }
 
     void set_logits(struct llama_context * ctx, int idx) {
-        const auto * logits = llama_get_logits_ith(ctx, idx);
+        const float       * sampled_probs  = llama_get_sampled_probs_ith     (ctx, idx);
+        const float       * sampled_logits = llama_get_sampled_logits_ith    (ctx, idx);
+        const llama_token * sampled_ids    = llama_get_sampled_candidates_ith(ctx, idx);
 
         const llama_model * model = llama_get_model(ctx);
         const llama_vocab * vocab = llama_model_get_vocab(model);
 
         const int n_vocab = llama_vocab_n_tokens(vocab);
 
-        cur.resize(n_vocab);
-
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+        if (sampled_probs) {
+            const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
+            cur.resize(sampled_probs_count);
+            for (uint32_t i = 0; i < sampled_probs_count; ++i) {
+                cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]};
+            }
+        } else if (sampled_logits) {
+            const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
+            cur.resize(sampled_logits_count);
+            for (uint32_t i = 0; i < sampled_logits_count; i++) {
+                cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f};
+            }
+        } else {
+            const auto * logits = llama_get_logits_ith(ctx, idx);
+            GGML_ASSERT(logits != nullptr);
+            cur.resize(n_vocab);
+            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+            }
         }
 
         cur_p = { cur.data(), cur.size(), -1, false };
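
Context for the new branches: when a backend sampler produced per-candidate probabilities or logits, `cur` now holds only those candidates instead of the full vocabulary; otherwise the old full-logits path is used. The structures being filled are the ones declared in include/llama.h, reproduced here for reference only (not part of this diff):

```cpp
typedef struct llama_token_data {
    llama_token id;    // token id
    float       logit; // log-odds of the token
    float       p;     // probability of the token
} llama_token_data;

typedef struct llama_token_data_array {
    llama_token_data * data;
    size_t             size;
    int64_t            selected; // index into data (not a token id); -1 if none
    bool               sorted;
} llama_token_data_array;
```
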
@@ -159,7 +176,7 @@ std::string common_params_sampling::print() const {
     return std::string(result);
 }
 
-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
+struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params) {
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
@@ -179,24 +196,30 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 #endif // LLAMA_USE_LLGUIDANCE
     } else {
         std::vector<std::string> trigger_patterns;
-        std::vector<std::string> patterns_anywhere;
         std::vector<llama_token> trigger_tokens;
         for (const auto & trigger : params.grammar_triggers) {
             switch (trigger.type) {
                 case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
                     {
                         const auto & word = trigger.value;
-                        patterns_anywhere.push_back(regex_escape(word));
+                        trigger_patterns.push_back(regex_escape(word));
                         break;
                     }
                 case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
                     {
-                        patterns_anywhere.push_back(trigger.value);
+                        trigger_patterns.push_back(trigger.value);
                         break;
                     }
                 case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
                     {
-                        trigger_patterns.push_back(trigger.value);
+                        const auto & pattern = trigger.value;
+                        std::string anchored = "^$";
+                        if (!pattern.empty()) {
+                            anchored = (pattern.front() != '^' ? "^" : "")
+                                + pattern
+                                + (pattern.back() != '$' ? "$" : "");
+                        }
+                        trigger_patterns.push_back(anchored);
                         break;
                     }
                 case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
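
The PATTERN_FULL case above now anchors the pattern itself instead of relying on a separate "match-anywhere" wrapper (removed in the next hunk). A standalone sketch of the same anchoring logic, with a hypothetical helper name, purely to illustrate the transformation:

```cpp
#include <string>

// Mirrors the anchoring applied to COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL above:
// prepend '^' and append '$' unless the pattern already has them; an empty pattern
// becomes "^$" so it can only match the empty string.
static std::string anchor_full_pattern(const std::string & pattern) {
    if (pattern.empty()) {
        return "^$";
    }
    return (pattern.front() != '^' ? "^" : "")
         + pattern
         + (pattern.back() != '$' ? "$" : "");
}

// e.g. anchor_full_pattern("<tool_call>[\\s\\S]*") == "^<tool_call>[\\s\\S]*$"
```
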
@@ -210,10 +233,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             }
         }
 
-        if (!patterns_anywhere.empty()) {
-            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
-        }
-
         std::vector<const char *> trigger_patterns_c;
         trigger_patterns_c.reserve(trigger_patterns.size());
         for (const auto & regex : trigger_patterns) {
@@ -296,6 +315,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         llama_sampler_chain_add(chain, smpl);
     }
 
+    if (grmr && params.backend_sampling) {
+        LOG_WRN("%s: backend sampling is not compatible with grammar, disabling\n", __func__);
+
+        params.backend_sampling = false;
+    }
+
     auto * result = new common_sampler {
         /* .params = */ params,
         /* .grmr   = */ grmr,
@@ -405,6 +430,25 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     auto & chain = gsmpl->chain;
     auto & cur_p = gsmpl->cur_p; // initialized by set_logits
 
+    // Check if a backend sampler has already sampled a token in which case we
+    // return that token id directly.
+    {
+        id = llama_get_sampled_token_ith(ctx, idx);
+
+        if (id != LLAMA_TOKEN_NULL) {
+            LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
+
+            GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
+
+            // TODO: simplify
+            gsmpl->cur.resize(1);
+            gsmpl->cur[0] = { id, 0.0f, 1.0f };
+            cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
+
+            return id;
+        }
+    }
+
     gsmpl->set_logits(ctx, idx);
 
     if (grammar_first) {
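
With this change, callers of `common_sampler_sample()` do not need to know whether the token was selected on the backend or by the CPU chain; a typical decode loop is unchanged. A rough sketch, assuming the model is loaded, the context is created, and the prompt has already been decoded so that output index -1 refers to the last token (error handling omitted, `generate` is a hypothetical name):

```cpp
#include "common.h"
#include "sampling.h"

static void generate(llama_model * model, llama_context * ctx,
                     common_params_sampling & params, int n_predict) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    // note: may disable params.backend_sampling when a grammar is used (see above)
    common_sampler * gsmpl = common_sampler_init(model, params);

    for (int i = 0; i < n_predict; ++i) {
        // works the same whether the token came from the backend or the CPU samplers
        llama_token id = common_sampler_sample(gsmpl, ctx, /* idx */ -1);
        if (llama_vocab_is_eog(vocab, id)) {
            break;
        }
        common_sampler_accept(gsmpl, id, /* accept_grammar */ true);
        llama_decode(ctx, llama_batch_get_one(&id, 1));
    }

    common_sampler_free(gsmpl);
}
```
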
package/src/llama.cpp/common/sampling.h
@@ -36,7 +36,8 @@ struct common_sampler;
 
 // llama_sampler API overloads
 
-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
+// note: can mutate params in some cases
+struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params);
 
 void common_sampler_free(struct common_sampler * gsmpl);
 
@@ -48,6 +49,7 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 // arguments can be nullptr to skip printing
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
 
+// get the underlying llama_sampler_chain
 struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
 
 // extended sampling implementation:
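
A small illustration of the newly documented accessor: listing the samplers in the chain owned by a `common_sampler` (a minimal sketch, assuming `gsmpl` was created with `common_sampler_init`):

```cpp
#include <cstdio>
#include "sampling.h"

static void print_chain(const common_sampler * gsmpl) {
    llama_sampler * chain = common_sampler_get(gsmpl);

    const int n = llama_sampler_chain_n(chain);
    for (int i = 0; i < n; ++i) {
        llama_sampler * smpl = llama_sampler_chain_get(chain, i);
        std::printf("sampler %d: %s\n", i, llama_sampler_name(smpl));
    }
}
```
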
package/src/llama.cpp/include/llama.h
@@ -316,6 +316,11 @@ extern "C" {
         bool no_alloc; // only load metadata and simulate memory allocations
     };
 
+    struct llama_sampler_seq_config {
+        llama_seq_id           seq_id;
+        struct llama_sampler * sampler;
+    };
+
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
     // https://github.com/ggml-org/llama.cpp/pull/7544
     struct llama_context_params {
@@ -364,6 +369,12 @@ extern "C" {
         bool kv_unified; // use a unified buffer across the input sequences when computing the attention
                          // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
                          // ref: https://github.com/ggml-org/llama.cpp/pull/14363
+
+        // [EXPERIMENTAL]
+        // backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
+        // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
+        struct llama_sampler_seq_config * samplers;
+        size_t                            n_samplers;
     };
 
     // model quantization parameters
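
Taken together, the two additions above let a caller attach one sampler chain per sequence at context creation time. A hedged sketch (which samplers a given backend can actually offload is not specified in this diff, so the chain contents below are only an example; the helper name is hypothetical):

```cpp
#include "llama.h"

static llama_context * create_ctx_with_backend_sampling(llama_model * model) {
    // build a sampler chain for sequence 0; the caller must keep the chain alive
    // for the lifetime of the context, per the comment in llama_context_params
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    // one entry per sequence that should be sampled on the backend
    llama_sampler_seq_config seq_samplers[] = {
        { /* seq_id */ 0, /* sampler */ chain },
    };

    llama_context_params cparams = llama_context_default_params();
    cparams.samplers   = seq_samplers;
    cparams.n_samplers = sizeof(seq_samplers) / sizeof(seq_samplers[0]);

    return llama_init_from_model(model, cparams);
}
```
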
@@ -524,6 +535,7 @@ extern "C" {
     LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
@@ -992,6 +1004,32 @@ extern "C" {
     // otherwise: float[n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
 
+    //
+    // backend sampling API [EXPERIMENTAL]
+    // note: use only if the llama_context was created with at least one llama_sampler_seq_config
+    //
+
+    // Get the backend sampled token for the ith token.
+    // Returns LLAMA_TOKEN_NULL if no token was sampled.
+    LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the backend sampled probabilites for the ith token
+    // The index matches llama_get_sampled_token_ith().
+    // Returns NULL if no probabilites were generated.
+    LLAMA_API float *  llama_get_sampled_probs_ith      (struct llama_context * ctx, int32_t i);
+    LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the backend sampled logits for the ith token
+    // Returns NULL if no logits were sampled.
+    LLAMA_API float *  llama_get_sampled_logits_ith      (struct llama_context * ctx, int32_t i);
+    LLAMA_API uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the backend sampled candidates (token ids) for the ith token
+    // These are needed to map probability/logit indices to vocab token ids.
+    // Returns NULL if no candidates were sampled.
+    LLAMA_API llama_token * llama_get_sampled_candidates_ith      (struct llama_context * ctx, int32_t i);
+    LLAMA_API uint32_t      llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i);
+
     //
     // Vocab
     //
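
A hedged sketch of how these getters might be used after `llama_decode()`, falling back to the regular logits when nothing was sampled on the backend. The indexing is assumed to mirror `llama_get_logits_ith()` (i.e. `i` is the output index within the batch); the helper name is hypothetical:

```cpp
#include <cstdio>
#include "llama.h"

static void inspect_sampled(llama_context * ctx, int32_t i) {
    const llama_token id = llama_get_sampled_token_ith(ctx, i);
    if (id != LLAMA_TOKEN_NULL) {
        std::printf("backend sampled token %d\n", id);
        return;
    }

    // no token was selected on the backend - probabilities for a reduced
    // candidate set may still be available
    const float       * probs = llama_get_sampled_probs_ith(ctx, i);
    const llama_token * cand  = llama_get_sampled_candidates_ith(ctx, i);
    if (probs && cand) {
        const uint32_t n = llama_get_sampled_probs_count_ith(ctx, i);
        for (uint32_t k = 0; k < n && k < 5; ++k) {
            std::printf("candidate %d: p = %.3f\n", cand[k], probs[k]);
        }
        return;
    }

    // nothing was sampled on the backend: use the full logits as before
    const float * logits = llama_get_logits_ith(ctx, i);
    (void) logits;
}
```
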
@@ -1163,11 +1201,16 @@ extern "C" {
     //
     //    llama_sampler_free(smpl);
     //
-    // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
-    //
 
     typedef void * llama_sampler_context_t;
 
+    struct llama_sampler_data {
+        struct ggml_tensor * logits;
+        struct ggml_tensor * probs;
+        struct ggml_tensor * sampled;
+        struct ggml_tensor * candidates;
+    };
+
     // user code can implement the interface below in order to create custom llama_sampler
     struct llama_sampler_i {
         const char * (*name)(const struct llama_sampler * smpl); // can be NULL
@@ -1177,17 +1220,45 @@ extern "C" {
         struct llama_sampler * (*clone)(const struct llama_sampler * smpl); // can be NULL if ctx is NULL
         void                   (*free) (      struct llama_sampler * smpl); // can be NULL if ctx is NULL
 
-        // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
-        //void (*apply_ggml) (struct llama_sampler * smpl, ...);
+        // [EXPERIMENTAL]
+        // backend sampling interface:
+
+        // return true if the backend supports all ops needed by the sampler
+        // note: call once per sampler
+        bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft);
+
+        // call after .backend_apply()
+        void (*backend_accept)(
+                struct llama_sampler * smpl,
+                struct ggml_context  * ctx,
+                struct ggml_cgraph   * gf,
+                struct ggml_tensor   * selected_token);
+
+        // call after .backend_init()
+        void (*backend_apply)(
+                struct llama_sampler      * smpl,
+                struct ggml_context       * ctx,
+                struct ggml_cgraph        * gf,
+                struct llama_sampler_data * data);
+
+        // called before graph execution to set inputs for the current ubatch
+        void (*backend_set_input)(struct llama_sampler * smpl);
     };
 
     struct llama_sampler {
-        const struct llama_sampler_i * iface;
-        llama_sampler_context_t        ctx;
+        struct llama_sampler_i * iface;
+
+        llama_sampler_context_t  ctx;
     };
 
+    // [EXPERIMENTAL]
+    // attach a sampler to the context
+    // note: prefer initializing the context with llama_context_params.samplers when possible
+    // note: changing the samplers of a context can cause graph reallocations and degraded performance
+    LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
+
     // mirror of llama_sampler_i:
-    LLAMA_API struct llama_sampler * llama_sampler_init  (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
+    LLAMA_API struct llama_sampler * llama_sampler_init  (      struct llama_sampler_i * iface, llama_sampler_context_t ctx);
     LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
     LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
     LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
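
To illustrate the shape of the new interface, here is a heavily hedged sketch of a custom greedy sampler that implements both the CPU path and the new backend callbacks. The exact contract (which fields of `llama_sampler_data` a sampler reads and writes, which callbacks may be left NULL, and what `backend_set_input` must do) is not spelled out in this header, so treat everything below as an assumption about the data flow, not the library's documented behavior:

```cpp
#include "llama.h"

// illustration only: greedy sampling, with the backend path expressed as graph ops
static const char * greedy_name(const llama_sampler * /*smpl*/) {
    return "greedy-backend-sketch";
}

static void greedy_apply(llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
    // CPU fallback: select the highest-logit candidate
    cur_p->selected = 0;
    for (size_t i = 1; i < cur_p->size; ++i) {
        if (cur_p->data[i].logit > cur_p->data[cur_p->selected].logit) {
            cur_p->selected = (int64_t) i;
        }
    }
}

static bool greedy_backend_init(llama_sampler * /*smpl*/, ggml_backend_buffer_type_t /*buft*/) {
    // a real sampler would verify that the backend supports the ops it emits
    // (here: GGML_OP_ARGMAX); this sketch simply claims support
    return true;
}

static void greedy_backend_apply(
        llama_sampler      * /*smpl*/,
        ggml_context       * ctx,
        ggml_cgraph        * gf,
        llama_sampler_data * data) {
    // assumption: the sampler hands its result back by writing into data->sampled
    data->sampled = ggml_argmax(ctx, data->logits);
    ggml_build_forward_expand(gf, data->sampled);
}

static llama_sampler_i greedy_iface = {
    /* .name              = */ greedy_name,
    /* .accept            = */ nullptr,
    /* .apply             = */ greedy_apply,
    /* .reset             = */ nullptr,
    /* .clone             = */ nullptr,
    /* .free              = */ nullptr,
    /* .backend_init      = */ greedy_backend_init,
    /* .backend_accept    = */ nullptr,
    /* .backend_apply     = */ greedy_backend_apply,
    /* .backend_set_input = */ nullptr,
};

// note: llama_sampler_init() now takes a non-const iface pointer (see above)
// llama_sampler * smpl = llama_sampler_init(&greedy_iface, /* ctx */ nullptr);
```
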
@@ -1203,7 +1274,15 @@ extern "C" {
 
     // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
     LLAMA_API void                   llama_sampler_chain_add(      struct llama_sampler * chain, struct llama_sampler * smpl);
-    LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
+
+    // return NULL if:
+    // - the sampler is NULL
+    // - the sampler is not a llama_sampler_chain
+    // - the index is out of bounds, unless i == -1
+    // - if i == -1, returns the chain itself (can be used to check if the sampler is a chain)
+    LLAMA_API struct llama_sampler * llama_sampler_chain_get(      struct llama_sampler * chain, int32_t i);
+
+    // the total number of samplers in the chain
     LLAMA_API int                    llama_sampler_chain_n  (const struct llama_sampler * chain);
 
     // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
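
As documented above, passing `i == -1` returns the chain itself, which gives a cheap way to check whether an arbitrary sampler is a chain (assuming `smpl` is a `llama_sampler *`):

```cpp
// true if smpl was created with llama_sampler_chain_init()
const bool is_chain = llama_sampler_chain_get(smpl, -1) != NULL;
```
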
package/src/llama.cpp/src/llama-arch.cpp
@@ -152,6 +152,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_VOCAB_SIZE,                 "%s.vocab_size" },
     { LLM_KV_CONTEXT_LENGTH,             "%s.context_length" },
     { LLM_KV_EMBEDDING_LENGTH,           "%s.embedding_length" },
+    { LLM_KV_EMBEDDING_LENGTH_OUT,       "%s.embedding_length_out" },
     { LLM_KV_FEATURES_LENGTH,            "%s.features_length" },
     { LLM_KV_BLOCK_COUNT,                "%s.block_count" },
     { LLM_KV_LEADING_DENSE_BLOCK_COUNT,  "%s.leading_dense_block_count" },
@@ -2075,6 +2076,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                 LLM_TENSOR_TOKEN_EMBD,
                 LLM_TENSOR_OUTPUT_NORM_LFM2,
                 LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_DENSE_2_OUT,
             };
         case LLM_ARCH_LFM2MOE:
             return {

package/src/llama.cpp/src/llama-arch.h
@@ -156,6 +156,7 @@ enum llm_kv {
     LLM_KV_VOCAB_SIZE,
     LLM_KV_CONTEXT_LENGTH,
     LLM_KV_EMBEDDING_LENGTH,
+    LLM_KV_EMBEDDING_LENGTH_OUT,
     LLM_KV_FEATURES_LENGTH,
     LLM_KV_BLOCK_COUNT,
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,