@fugood/llama.node 1.4.14 → 1.5.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. package/lib/binding.ts +13 -6
  2. package/lib/index.js +2 -2
  3. package/lib/index.ts +8 -3
  4. package/package.json +15 -15
  5. package/scripts/llama.cpp.patch +77 -65
  6. package/src/LlamaContext.cpp +31 -34
  7. package/src/llama.cpp/CMakeLists.txt +24 -8
  8. package/src/llama.cpp/common/CMakeLists.txt +15 -34
  9. package/src/llama.cpp/common/arg.cpp +59 -10
  10. package/src/llama.cpp/common/chat-parser.cpp +115 -0
  11. package/src/llama.cpp/common/chat.cpp +356 -34
  12. package/src/llama.cpp/common/chat.h +17 -13
  13. package/src/llama.cpp/common/common.cpp +0 -1
  14. package/src/llama.cpp/common/common.h +30 -25
  15. package/src/llama.cpp/common/debug.cpp +165 -0
  16. package/src/llama.cpp/common/debug.h +43 -0
  17. package/src/llama.cpp/common/download.cpp +12 -342
  18. package/src/llama.cpp/common/download.h +6 -0
  19. package/src/llama.cpp/common/jinja/caps.cpp +237 -0
  20. package/src/llama.cpp/common/jinja/caps.h +24 -0
  21. package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
  22. package/src/llama.cpp/common/jinja/lexer.h +157 -0
  23. package/src/llama.cpp/common/jinja/parser.cpp +591 -0
  24. package/src/llama.cpp/common/jinja/parser.h +21 -0
  25. package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
  26. package/src/llama.cpp/common/jinja/runtime.h +628 -0
  27. package/src/llama.cpp/common/jinja/string.cpp +207 -0
  28. package/src/llama.cpp/common/jinja/string.h +58 -0
  29. package/src/llama.cpp/common/jinja/utils.h +49 -0
  30. package/src/llama.cpp/common/jinja/value.cpp +1221 -0
  31. package/src/llama.cpp/common/jinja/value.h +464 -0
  32. package/src/llama.cpp/common/preset.cpp +12 -2
  33. package/src/llama.cpp/common/sampling.cpp +52 -19
  34. package/src/llama.cpp/ggml/include/ggml.h +39 -7
  35. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
  37. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
  39. package/src/llama.cpp/include/llama-cpp.h +3 -1
  40. package/src/llama.cpp/include/llama.h +29 -2
  41. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  42. package/src/llama.cpp/src/llama-adapter.cpp +7 -13
  43. package/src/llama.cpp/src/llama-adapter.h +1 -3
  44. package/src/llama.cpp/src/llama-arch.cpp +35 -0
  45. package/src/llama.cpp/src/llama-arch.h +1 -0
  46. package/src/llama.cpp/src/llama-chat.cpp +20 -0
  47. package/src/llama.cpp/src/llama-chat.h +1 -0
  48. package/src/llama.cpp/src/llama-context.cpp +232 -144
  49. package/src/llama.cpp/src/llama-context.h +10 -0
  50. package/src/llama.cpp/src/llama-cparams.h +2 -0
  51. package/src/llama.cpp/src/llama-graph.cpp +31 -43
  52. package/src/llama.cpp/src/llama-hparams.cpp +0 -36
  53. package/src/llama.cpp/src/llama-hparams.h +38 -1
  54. package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
  55. package/src/llama.cpp/src/llama-kv-cache.h +0 -2
  56. package/src/llama.cpp/src/llama-mmap.cpp +13 -6
  57. package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
  58. package/src/llama.cpp/src/llama-model.cpp +215 -97
  59. package/src/llama.cpp/src/llama-model.h +3 -2
  60. package/src/llama.cpp/src/llama-sampling.cpp +170 -13
  61. package/src/llama.cpp/src/llama-vocab.cpp +37 -24
  62. package/src/llama.cpp/src/llama-vocab.h +1 -0
  63. package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
  64. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
  65. package/src/llama.cpp/src/models/models.h +13 -2
  66. package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
@@ -1513,12 +1513,9 @@ static void llama_sampler_top_p_backend_apply(
1513
1513
  mask_reshaped = ggml_set_rows(ctx, mask_reshaped, ones, ggml_cast(ctx, idxf, GGML_TYPE_I32));
1514
1514
  mask = ggml_reshape_1d(ctx, mask_reshaped, mask->ne[0]);
1515
1515
 
1516
- // Use ggml_scale_bias (output = (a * s) + b) which in this case becomes:
1517
- // top_p_bias = (mask * 1e9f) - 1e9f.
1518
- // So entries in the mask that we want to discard will become -1e9f, and
1519
- // others will be 0 (meaning that will not effect the logits).
1520
- const float large_val = 1e9f;
1521
- struct ggml_tensor * top_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
1516
+ // Apply -INFINITY bias for masked-out tokens
1517
+ // log(1) = 0 (keep), log(0) = -INF (discard)
1518
+ struct ggml_tensor * top_p_bias = ggml_log(ctx, mask);
1522
1519
  ggml_set_name(top_p_bias, "top_p_bias");
1523
1520
 
1524
1521
  data->logits = ggml_add(ctx, sorted_logits, top_p_bias);
@@ -1673,15 +1670,11 @@ static void llama_sampler_min_p_backend_apply(
1673
1670
  struct ggml_tensor * mask = ggml_step(ctx, sub);
1674
1671
  ggml_set_name(mask, "min_p_mask");
1675
1672
 
1676
- // Use ggml_scale_bias (output = (a * s) + b) which in this case becomes:
1677
- // min_p_bias = (mask * 1e9f) - 1e9f.
1678
- // So entries in the mask that we want to discard will become -1e9f, and
1679
- // others will be 0 (meaning that will not effect the logits).
1680
- const float large_val = 1e9f;
1681
- struct ggml_tensor * min_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
1673
+ // Apply -INFINITY bias for masked-out tokens
1674
+ // log(1) = 0 (keep), log(0) = -INF (discard)
1675
+ struct ggml_tensor * min_p_bias = ggml_log(ctx, mask);
1682
1676
  ggml_set_name(min_p_bias, "min_p_bias");
1683
1677
 
1684
- // Add the min_p bias to the logits.
1685
1678
  data->logits = ggml_add(ctx, data->logits, min_p_bias);
1686
1679
  ggml_set_name(data->logits, "min_p_logits");
1687
1680
 
@@ -3293,6 +3286,170 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa
3293
3286
  return result;
3294
3287
  }
3295
3288
 
3289
+ // adaptive-p sampler state
3290
+ //
3291
+ // maintains an exponential moving average of the *ORIGINAL* probabilities
3292
+ // of selected tokens, used to compute an adapted target at each sampling step.
3293
+ //
3294
+ // see llama.h for a full description of the sampler
3295
+ //
3296
+ // ref: https://github.com/ggml-org/llama.cpp/pull/17927
3297
+ //
3298
+ struct llama_sampler_adaptive_p {
3299
+ const float target; // target probability (0.0 - 1.0; negative = disabled)
3300
+ const float decay; // EMA decay; history ~= 1/(1-decay) tokens (0.0 - 0.99)
3301
+ const uint32_t seed; // original RNG seed
3302
+ uint32_t seed_cur; // actual RNG seed
3303
+ std::mt19937 rng; // RNG state
3304
+ float weighted_sum; // sum(p_i * decay^i)
3305
+ float total_weight; // sum(decay^i), converges to 1/(1-decay)
3306
+ std::vector<float> original_probs; // pre-transform probs, cached for EMA update
3307
+ llama_token pending_token_id; // token ID of selected token
3308
+ int32_t pending_token_idx; // index of orig. prob. of selected token in original_probs
3309
+ };
3310
+
3311
+ // adaptive probability transformation constants
3312
// width by which the distance between a token's probability and the adapted target is scaled
static constexpr float DISTRIBUTION_WIDTH{0.3f};
// logit given to a token whose probability sits exactly on the adapted target (distance 0)
static constexpr float PEAK_LOGIT_VALUE{5.0f};
// multiplier of the distance penalty subtracted from PEAK_LOGIT_VALUE
static constexpr float SHARPNESS{10.0f};
// reciprocal of DISTRIBUTION_WIDTH, precomputed so the per-token transform multiplies instead of divides
static constexpr float INV_WIDTH{1.0f / DISTRIBUTION_WIDTH};
3316
+
3317
// Returns the display name of the sampler; the sampler instance is not inspected.
static const char * llama_sampler_adaptive_p_name(const struct llama_sampler * /*smpl*/) {
    constexpr const char * sampler_name = "adaptive-p";
    return sampler_name;
}
3320
+
3321
+ static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
3322
+ auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
3323
+
3324
+ llama_sampler_softmax_impl(cur_p, false);
3325
+
3326
+ if (ctx->target < 0.0f) {
3327
+ // at negative target values, adaptive-p is no-op
3328
+ // we simply sample from the existing distribution
3329
+ cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
3330
+ return;
3331
+ }
3332
+
3333
+ // store the original probabilities
3334
+ ctx->original_probs.resize(cur_p->size);
3335
+ for (size_t i = 0; i < cur_p->size; ++i) {
3336
+ ctx->original_probs[i] = cur_p->data[i].p;
3337
+ }
3338
+
3339
+ // using the EMA, compute the adapted target probability for the current sampling step
3340
+ auto target = std::clamp(ctx->target, 0.0f, 1.0f);
3341
+ float adapted_target = std::clamp(
3342
+ ctx->total_weight == 0.0f ? target : 2.0f * target - (ctx->weighted_sum / ctx->total_weight),
3343
+ 0.0f, 1.0f
3344
+ );
3345
+
3346
+ // adaptive probability transform
3347
+ //
3348
+ // quadratic near target for fine differentiation, transitioning to linear decay in the
3349
+ // tails. unbounded negative logits ensure proper suppression of far-from-target tokens
3350
+ // after the softmax.
3351
+ //
3352
+ for (size_t i = 0; i < cur_p->size; ++i) {
3353
+ if (cur_p->data[i].logit == -INFINITY) {
3354
+ // don't transform logits that are -INFINITY
3355
+ // (as masked out by e.g. min-p and top-p when using backend sampling)
3356
+ continue;
3357
+ }
3358
+ float dist = std::abs((cur_p->data[i].p - adapted_target) * INV_WIDTH);
3359
+ cur_p->data[i].logit = PEAK_LOGIT_VALUE - SHARPNESS * dist * dist / (1.0f + dist);
3360
+ }
3361
+
3362
+ // softmax and sample from the transformed distribution
3363
+ llama_sampler_softmax_impl(cur_p, false);
3364
+ const int idx = llama_sample_dist(cur_p, ctx->rng);
3365
+ cur_p->selected = idx;
3366
+
3367
+ // store the selected token ID for acceptance later
3368
+ ctx->pending_token_id = cur_p->data[idx].id;
3369
+ ctx->pending_token_idx = idx;
3370
+ }
3371
+
3372
+ static void llama_sampler_adaptive_p_accept(struct llama_sampler * smpl, llama_token token) {
3373
+ auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
3374
+ if (ctx->pending_token_id == token) {
3375
+ GGML_ASSERT(ctx->pending_token_id != LLAMA_TOKEN_NULL);
3376
+ GGML_ASSERT(ctx->pending_token_idx != -1);
3377
+ // update EMA with the original probability of the selected token
3378
+ ctx->weighted_sum = ctx->original_probs[ctx->pending_token_idx] + ctx->decay * ctx->weighted_sum;
3379
+ ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight;
3380
+ }
3381
+ ctx->pending_token_id = LLAMA_TOKEN_NULL;
3382
+ ctx->pending_token_idx = -1;
3383
+ }
3384
+
3385
+ static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) {
3386
+ auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
3387
+ // ctx->target and ctx->decay never change after init, so it's safe to keep them as is.
3388
+ // original_probs is completely overwritten on every call to _apply.
3389
+ // so we only need to reset the EMA state and pending token.
3390
+ ctx->weighted_sum = ctx->target / (1.0f - ctx->decay);
3391
+ ctx->total_weight = 1.0f / (1.0f - ctx->decay);
3392
+ ctx->pending_token_id = LLAMA_TOKEN_NULL;
3393
+ ctx->pending_token_idx = -1;
3394
+ ctx->seed_cur = get_rng_seed(ctx->seed);
3395
+ ctx->rng.seed(ctx->seed_cur);
3396
+ }
3397
+
3398
+ static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) {
3399
+ const auto * ctx = (const llama_sampler_adaptive_p *) smpl->ctx;
3400
+ auto * result = llama_sampler_init_adaptive_p(ctx->target, ctx->decay, ctx->seed);
3401
+ auto * result_ctx = (llama_sampler_adaptive_p *) result->ctx;
3402
+
3403
+ // copy everything (target, decay, seed, and RNG are already set)
3404
+ result_ctx->weighted_sum = ctx->weighted_sum;
3405
+ result_ctx->total_weight = ctx->total_weight;
3406
+ result_ctx->pending_token_id = ctx->pending_token_id;
3407
+ result_ctx->pending_token_idx = ctx->pending_token_idx;
3408
+
3409
+ return result;
3410
+ }
3411
+
3412
+ static void llama_sampler_adaptive_p_free(struct llama_sampler * smpl) {
3413
+ delete (llama_sampler_adaptive_p *) smpl->ctx;
3414
+ }
3415
+
3416
+ static struct llama_sampler_i llama_sampler_adaptive_p_i = {
3417
+ /* .name = */ llama_sampler_adaptive_p_name,
3418
+ /* .accept = */ llama_sampler_adaptive_p_accept,
3419
+ /* .apply = */ llama_sampler_adaptive_p_apply,
3420
+ /* .reset = */ llama_sampler_adaptive_p_reset,
3421
+ /* .clone = */ llama_sampler_adaptive_p_clone,
3422
+ /* .free = */ llama_sampler_adaptive_p_free,
3423
+ /* .backend_init = */ nullptr,
3424
+ /* .backend_accept = */ nullptr,
3425
+ /* .backend_apply = */ nullptr,
3426
+ /* .backend_set_input = */ nullptr,
3427
+ };
3428
+
3429
+ struct llama_sampler * llama_sampler_init_adaptive_p(
3430
+ float target,
3431
+ float decay,
3432
+ uint32_t seed
3433
+ ) {
3434
+ auto seed_cur = get_rng_seed(seed);
3435
+ float clamped_decay = std::clamp(decay, 0.0f, 0.99f);
3436
+ return llama_sampler_init(
3437
+ /* .iface = */ &llama_sampler_adaptive_p_i,
3438
+ /* .ctx = */ new llama_sampler_adaptive_p {
3439
+ /* .target = */ target,
3440
+ /* .decay = */ clamped_decay,
3441
+ /* .seed = */ seed,
3442
+ /* .seed_cur = */ seed_cur,
3443
+ /* .rng = */ std::mt19937(seed_cur),
3444
+ /* .weighted_sum = */ target / (1.0f - clamped_decay),
3445
+ /* .total_weight = */ 1.0f / (1.0f - clamped_decay),
3446
+ /* .original_probs = */ {},
3447
+ /* .pending_token_id = */ LLAMA_TOKEN_NULL,
3448
+ /* .pending_token_idx = */ -1
3449
+ }
3450
+ );
3451
+ }
3452
+
3296
3453
  // logit-bias
3297
3454
 
3298
3455
  struct llama_sampler_logit_bias : public llama_sampler_backend {
@@ -461,6 +461,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
461
461
  "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
462
462
  };
463
463
  break;
464
+ case LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE:
465
+ regex_exprs = {
466
+ // original regex from tokenizer.json
467
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
468
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
469
+ };
470
+ break;
464
471
  default:
465
472
  // default regex for BPE tokenization pre-processing
466
473
  regex_exprs = {
@@ -1965,6 +1972,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1965
1972
  } else if (
1966
1973
  tokenizer_pre == "exaone4") {
1967
1974
  pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1975
+ } else if (
1976
+ tokenizer_pre == "exaone-moe") {
1977
+ pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE;
1968
1978
  } else if (
1969
1979
  tokenizer_pre == "chameleon") {
1970
1980
  pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
@@ -2436,7 +2446,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
2436
2446
  auto & attr = id_to_token[t.second].attr;
2437
2447
 
2438
2448
  if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
2439
- attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
2449
+ LLAMA_LOG_WARN("%s: setting token '%s' (%d) attribute to USER_DEFINED (%u), old attributes: %u\n",
2450
+ __func__, t.first.c_str(), t.second, LLAMA_TOKEN_ATTR_USER_DEFINED, attr);
2451
+
2452
+ attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
2440
2453
  }
2441
2454
  }
2442
2455
 
@@ -2489,7 +2502,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
2489
2502
  special_eog_ids.erase(end_id);
2490
2503
 
2491
2504
  auto & attr = id_to_token[end_id].attr;
2492
- attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
2505
+ attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
2493
2506
 
2494
2507
  LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
2495
2508
  }
@@ -3289,34 +3302,34 @@ int32_t llama_vocab::impl::detokenize(
3289
3302
  }
3290
3303
 
3291
3304
  void llama_vocab::impl::print_info() const {
3292
- LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
3293
- LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
3294
- LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
3305
+ LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
3306
+ LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
3307
+ LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
3295
3308
 
3296
3309
  // special tokens
3297
- if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
3298
- if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
3299
- if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
3300
- if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
3301
- if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
3302
- if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
3303
- if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
3304
- if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
3305
-
3306
- if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
3307
-
3308
- if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
3309
- if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
3310
- if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
3311
- if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
3312
- if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
3313
- if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
3310
+ if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
3311
+ if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
3312
+ if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
3313
+ if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
3314
+ if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
3315
+ if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
3316
+ if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
3317
+ if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
3318
+
3319
+ if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
3320
+
3321
+ if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
3322
+ if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
3323
+ if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
3324
+ if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
3325
+ if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
3326
+ if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
3314
3327
 
3315
3328
  for (const auto & id : special_eog_ids) {
3316
- LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
3329
+ LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
3317
3330
  }
3318
3331
 
3319
- LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
3332
+ LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
3320
3333
  }
3321
3334
 
3322
3335
  llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
@@ -53,6 +53,7 @@ enum llama_vocab_pre_type {
53
53
  LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
54
54
  LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
55
55
  LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
56
+ LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
56
57
  };
57
58
 
58
59
  struct LLM_KV;
@@ -0,0 +1,146 @@
1
+ #include "models.h"
2
+
3
+
4
+ llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
5
+ llm_graph_context(params) {
6
+ const int64_t n_embd_head = hparams.n_embd_head_k;
7
+
8
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
9
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
10
+
11
+ ggml_tensor * cur;
12
+ ggml_tensor * inpL;
13
+
14
+ inpL = build_inp_embd(model.tok_embd);
15
+
16
+ // inp_pos - contains the positions
17
+ ggml_tensor * inp_pos = build_inp_pos();
18
+
19
+ auto * inp_attn_iswa = build_attn_inp_kv_iswa();
20
+
21
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
22
+
23
+ const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
24
+ for (int il = 0; il < n_transformer_layers; ++il) {
25
+ ggml_tensor * inpSA = inpL;
26
+
27
+ // use RoPE for SWA layers
28
+ const bool is_local_layer = hparams.is_swa(il);
29
+
30
+ // norm
31
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
32
+ cb(cur, "attn_norm", il);
33
+
34
+ // self-attention
35
+ {
36
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
37
+
38
+ // compute Q and K and RoPE them
39
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
40
+ cb(Qcur, "Qcur", il);
41
+
42
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
43
+ cb(Kcur, "Kcur", il);
44
+
45
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
46
+ cb(Vcur, "Vcur", il);
47
+
48
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
49
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
50
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
51
+
52
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
53
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
54
+ cb(Qcur, "Qcur_normed", il);
55
+ cb(Kcur, "Kcur_normed", il);
56
+
57
+ if (is_local_layer) {
58
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
59
+ freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
60
+
61
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
62
+ freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
63
+ }
64
+ cb(Qcur, "Qcur", il);
65
+ cb(Kcur, "Kcur", il);
66
+ cb(Vcur, "Vcur", il);
67
+
68
+ cur = build_attn(inp_attn_iswa,
69
+ model.layers[il].wo, NULL,
70
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
71
+ cb(cur, "attn_out", il);
72
+ }
73
+ if (il == n_transformer_layers - 1 && inp_out_ids) {
74
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
75
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
76
+ }
77
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
78
+ cb(ffn_inp, "ffn_inp", il);
79
+
80
+ // norm
81
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
82
+ cb(cur, "ffn_norm", il);
83
+
84
+ // feed-forward network
85
+ if (model.layers[il].ffn_gate_inp == nullptr) {
86
+ // dense branch
87
+ cur = build_ffn(cur,
88
+ model.layers[il].ffn_up, NULL, NULL,
89
+ model.layers[il].ffn_gate, NULL, NULL,
90
+ model.layers[il].ffn_down, NULL, NULL, NULL,
91
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
92
+ cb(cur, "ffn_out", il);
93
+ } else {
94
+ // MoE branch
95
+ ggml_tensor * moe_out = build_moe_ffn(cur,
96
+ model.layers[il].ffn_gate_inp,
97
+ model.layers[il].ffn_up_exps,
98
+ model.layers[il].ffn_gate_exps,
99
+ model.layers[il].ffn_down_exps,
100
+ model.layers[il].ffn_exp_probs_b,
101
+ n_expert, n_expert_used,
102
+ LLM_FFN_SILU, hparams.expert_weights_norm,
103
+ true, hparams.expert_weights_scale,
104
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
105
+ il);
106
+ cb(moe_out, "ffn_moe_out", il);
107
+
108
+ // FFN shared expert
109
+ {
110
+ ggml_tensor * ffn_shexp =
111
+ build_ffn(cur,
112
+ model.layers[il].ffn_up_shexp, NULL, NULL,
113
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
114
+ model.layers[il].ffn_down_shexp, NULL, NULL,
115
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
116
+ cb(ffn_shexp, "ffn_shexp", il);
117
+
118
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
119
+ cb(cur, "ffn_out", il);
120
+ }
121
+ }
122
+
123
+ cur = ggml_add(ctx0, cur, ffn_inp);
124
+
125
+ cur = build_cvec(cur, il);
126
+ cb(cur, "l_out", il);
127
+
128
+ // input for next layer
129
+ inpL = cur;
130
+ }
131
+ cur = inpL;
132
+
133
+ // final norm
134
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
135
+
136
+ cb(cur, "result_norm", -1);
137
+ res->t_embd = cur;
138
+
139
+ // lm_head
140
+ cur = build_lora_mm(model.output, cur);
141
+
142
+ cb(cur, "result_output", -1);
143
+ res->t_logits = cur;
144
+
145
+ ggml_build_forward_expand(gf, cur);
146
+ }
@@ -255,10 +255,20 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
255
255
  inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
256
256
  inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
257
257
  cb(inp_per_layer, "inp_per_layer_selected", -1);
258
+ res->add_input(std::move(inp));
258
259
  } else {
259
- GGML_ABORT("TODO: support embd input");
260
+ // Vision embedding path: use padding token (ID=0) embedding
261
+ // TODO: verify if this is the correct behavior in transformers implementation
262
+ const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer
263
+
264
+ // Extract and dequantize padding token embedding (row 0)
265
+ ggml_tensor * padding = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
266
+ inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
267
+
268
+ // Reshape to [n_embd_altup, n_layer, 1]
269
+ inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
270
+ cb(inp_per_layer, "inp_per_layer_vision", -1);
260
271
  }
261
- res->add_input(std::move(inp));
262
272
  return inp_per_layer;
263
273
  }
264
274
 
@@ -276,7 +286,7 @@ ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp
276
286
  -1); // [n_embd_altup, n_layer, n_tokens]
277
287
  cb(per_layer_proj, "per_layer_proj", -1);
278
288
 
279
- inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
289
+ inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer);
280
290
  inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
281
291
  cb(inp_per_layer, "inp_per_layer", -1);
282
292
 
@@ -167,6 +167,10 @@ struct llm_build_exaone : public llm_graph_context {
167
167
  llm_build_exaone(const llama_model & model, const llm_graph_params & params);
168
168
  };
169
169
 
170
+ struct llm_build_exaone_moe : public llm_graph_context {
171
+ llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params);
172
+ };
173
+
170
174
  struct llm_build_falcon : public llm_graph_context {
171
175
  llm_build_falcon(const llama_model & model, const llm_graph_params & params);
172
176
  };
@@ -466,7 +470,8 @@ private:
466
470
  ggml_tensor * cur,
467
471
  int il);
468
472
 
469
- ggml_tensor * build_delta_net_chunking(
473
+ // returns pair of output and new state
474
+ std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
470
475
  ggml_tensor * q,
471
476
  ggml_tensor * k,
472
477
  ggml_tensor * v,
@@ -478,7 +483,8 @@ private:
478
483
  ggml_tensor * diag_mask,
479
484
  int il);
480
485
 
481
- ggml_tensor * build_delta_net_autoregressive(
486
+ // returns pair of output and new state
487
+ std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
482
488
  ggml_tensor * q,
483
489
  ggml_tensor * k,
484
490
  ggml_tensor * v,
@@ -493,6 +499,11 @@ private:
493
499
  ggml_tensor * gate,
494
500
  int layer);
495
501
 
502
+ // returns pair of qkv, z
503
+ std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
504
+ ggml_tensor * input,
505
+ int il);
506
+
496
507
  const llama_model & model;
497
508
  };
498
509