@fugood/llama.node 1.4.14 → 1.5.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +13 -6
- package/lib/index.js +2 -2
- package/lib/index.ts +8 -3
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +77 -65
- package/src/LlamaContext.cpp +31 -34
- package/src/llama.cpp/CMakeLists.txt +24 -8
- package/src/llama.cpp/common/CMakeLists.txt +15 -34
- package/src/llama.cpp/common/arg.cpp +59 -10
- package/src/llama.cpp/common/chat-parser.cpp +115 -0
- package/src/llama.cpp/common/chat.cpp +356 -34
- package/src/llama.cpp/common/chat.h +17 -13
- package/src/llama.cpp/common/common.cpp +0 -1
- package/src/llama.cpp/common/common.h +30 -25
- package/src/llama.cpp/common/debug.cpp +165 -0
- package/src/llama.cpp/common/debug.h +43 -0
- package/src/llama.cpp/common/download.cpp +12 -342
- package/src/llama.cpp/common/download.h +6 -0
- package/src/llama.cpp/common/jinja/caps.cpp +237 -0
- package/src/llama.cpp/common/jinja/caps.h +24 -0
- package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
- package/src/llama.cpp/common/jinja/lexer.h +157 -0
- package/src/llama.cpp/common/jinja/parser.cpp +591 -0
- package/src/llama.cpp/common/jinja/parser.h +21 -0
- package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
- package/src/llama.cpp/common/jinja/runtime.h +628 -0
- package/src/llama.cpp/common/jinja/string.cpp +207 -0
- package/src/llama.cpp/common/jinja/string.h +58 -0
- package/src/llama.cpp/common/jinja/utils.h +49 -0
- package/src/llama.cpp/common/jinja/value.cpp +1221 -0
- package/src/llama.cpp/common/jinja/value.h +464 -0
- package/src/llama.cpp/common/preset.cpp +12 -2
- package/src/llama.cpp/common/sampling.cpp +52 -19
- package/src/llama.cpp/ggml/include/ggml.h +39 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
- package/src/llama.cpp/include/llama-cpp.h +3 -1
- package/src/llama.cpp/include/llama.h +29 -2
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-adapter.cpp +7 -13
- package/src/llama.cpp/src/llama-adapter.h +1 -3
- package/src/llama.cpp/src/llama-arch.cpp +35 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +20 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +232 -144
- package/src/llama.cpp/src/llama-context.h +10 -0
- package/src/llama.cpp/src/llama-cparams.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +31 -43
- package/src/llama.cpp/src/llama-hparams.cpp +0 -36
- package/src/llama.cpp/src/llama-hparams.h +38 -1
- package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
- package/src/llama.cpp/src/llama-kv-cache.h +0 -2
- package/src/llama.cpp/src/llama-mmap.cpp +13 -6
- package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
- package/src/llama.cpp/src/llama-model.cpp +215 -97
- package/src/llama.cpp/src/llama-model.h +3 -2
- package/src/llama.cpp/src/llama-sampling.cpp +170 -13
- package/src/llama.cpp/src/llama-vocab.cpp +37 -24
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
- package/src/llama.cpp/src/models/models.h +13 -2
- package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
|
@@ -1513,12 +1513,9 @@ static void llama_sampler_top_p_backend_apply(
|
|
|
1513
1513
|
mask_reshaped = ggml_set_rows(ctx, mask_reshaped, ones, ggml_cast(ctx, idxf, GGML_TYPE_I32));
|
|
1514
1514
|
mask = ggml_reshape_1d(ctx, mask_reshaped, mask->ne[0]);
|
|
1515
1515
|
|
|
1516
|
-
//
|
|
1517
|
-
//
|
|
1518
|
-
|
|
1519
|
-
// others will be 0 (meaning that will not effect the logits).
|
|
1520
|
-
const float large_val = 1e9f;
|
|
1521
|
-
struct ggml_tensor * top_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
|
|
1516
|
+
// Apply -INFINITY bias for masked-out tokens
|
|
1517
|
+
// log(1) = 0 (keep), log(0) = -INF (discard)
|
|
1518
|
+
struct ggml_tensor * top_p_bias = ggml_log(ctx, mask);
|
|
1522
1519
|
ggml_set_name(top_p_bias, "top_p_bias");
|
|
1523
1520
|
|
|
1524
1521
|
data->logits = ggml_add(ctx, sorted_logits, top_p_bias);
|
|
@@ -1673,15 +1670,11 @@ static void llama_sampler_min_p_backend_apply(
|
|
|
1673
1670
|
struct ggml_tensor * mask = ggml_step(ctx, sub);
|
|
1674
1671
|
ggml_set_name(mask, "min_p_mask");
|
|
1675
1672
|
|
|
1676
|
-
//
|
|
1677
|
-
//
|
|
1678
|
-
|
|
1679
|
-
// others will be 0 (meaning that will not effect the logits).
|
|
1680
|
-
const float large_val = 1e9f;
|
|
1681
|
-
struct ggml_tensor * min_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
|
|
1673
|
+
// Apply -INFINITY bias for masked-out tokens
|
|
1674
|
+
// log(1) = 0 (keep), log(0) = -INF (discard)
|
|
1675
|
+
struct ggml_tensor * min_p_bias = ggml_log(ctx, mask);
|
|
1682
1676
|
ggml_set_name(min_p_bias, "min_p_bias");
|
|
1683
1677
|
|
|
1684
|
-
// Add the min_p bias to the logits.
|
|
1685
1678
|
data->logits = ggml_add(ctx, data->logits, min_p_bias);
|
|
1686
1679
|
ggml_set_name(data->logits, "min_p_logits");
|
|
1687
1680
|
|
|
@@ -3293,6 +3286,170 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa
|
|
|
3293
3286
|
return result;
|
|
3294
3287
|
}
|
|
3295
3288
|
|
|
3289
|
+
// adaptive-p sampler state
|
|
3290
|
+
//
|
|
3291
|
+
// maintains an exponential moving average of the *ORIGINAL* probabilities
|
|
3292
|
+
// of selected tokens, used to compute an adapted target at each sampling step.
|
|
3293
|
+
//
|
|
3294
|
+
// see llama.h for a full description of the sampler
|
|
3295
|
+
//
|
|
3296
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/17927
|
|
3297
|
+
//
|
|
3298
|
+
struct llama_sampler_adaptive_p {
|
|
3299
|
+
const float target; // target probability (0.0 - 1.0; negative = disabled)
|
|
3300
|
+
const float decay; // EMA decay; history ~= 1/(1-decay) tokens (0.0 - 0.99)
|
|
3301
|
+
const uint32_t seed; // original RNG seed
|
|
3302
|
+
uint32_t seed_cur; // actual RNG seed
|
|
3303
|
+
std::mt19937 rng; // RNG state
|
|
3304
|
+
float weighted_sum; // sum(p_i * decay^i)
|
|
3305
|
+
float total_weight; // sum(decay^i), converges to 1/(1-decay)
|
|
3306
|
+
std::vector<float> original_probs; // pre-transform probs, cached for EMA update
|
|
3307
|
+
llama_token pending_token_id; // token ID of selected token
|
|
3308
|
+
int32_t pending_token_idx; // index of orig. prob. of selected token in original_probs
|
|
3309
|
+
};
|
|
3310
|
+
|
|
3311
|
+
// constants of the adaptive probability transform
//
// the distance of a probability from the adapted target is scaled by INV_WIDTH,
// the transformed logit peaks at PEAK_LOGIT_VALUE and falls off with SHARPNESS.
static constexpr float DISTRIBUTION_WIDTH{0.3f};
static constexpr float PEAK_LOGIT_VALUE{5.0f};
static constexpr float SHARPNESS{10.0f};

// reciprocal of DISTRIBUTION_WIDTH
static constexpr float INV_WIDTH{1.0f / DISTRIBUTION_WIDTH};
|
|
3316
|
+
|
|
3317
|
+
// returns the fixed display name of the sampler; the sampler instance is not inspected
static const char * llama_sampler_adaptive_p_name(const struct llama_sampler * /*smpl*/) {
    static constexpr const char * k_name = "adaptive-p";
    return k_name;
}
|
|
3320
|
+
|
|
3321
|
+
static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
|
3322
|
+
auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
|
|
3323
|
+
|
|
3324
|
+
llama_sampler_softmax_impl(cur_p, false);
|
|
3325
|
+
|
|
3326
|
+
if (ctx->target < 0.0f) {
|
|
3327
|
+
// at negative target values, adaptive-p is no-op
|
|
3328
|
+
// we simply sample from the existing distribution
|
|
3329
|
+
cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
|
|
3330
|
+
return;
|
|
3331
|
+
}
|
|
3332
|
+
|
|
3333
|
+
// store the original probabilities
|
|
3334
|
+
ctx->original_probs.resize(cur_p->size);
|
|
3335
|
+
for (size_t i = 0; i < cur_p->size; ++i) {
|
|
3336
|
+
ctx->original_probs[i] = cur_p->data[i].p;
|
|
3337
|
+
}
|
|
3338
|
+
|
|
3339
|
+
// using the EMA, compute the adapted target probability for the current sampling step
|
|
3340
|
+
auto target = std::clamp(ctx->target, 0.0f, 1.0f);
|
|
3341
|
+
float adapted_target = std::clamp(
|
|
3342
|
+
ctx->total_weight == 0.0f ? target : 2.0f * target - (ctx->weighted_sum / ctx->total_weight),
|
|
3343
|
+
0.0f, 1.0f
|
|
3344
|
+
);
|
|
3345
|
+
|
|
3346
|
+
// adaptive probability transform
|
|
3347
|
+
//
|
|
3348
|
+
// quadratic near target for fine differentiation, transitioning to linear decay in the
|
|
3349
|
+
// tails. unbounded negative logits ensure proper suppression of far-from-target tokens
|
|
3350
|
+
// after the softmax.
|
|
3351
|
+
//
|
|
3352
|
+
for (size_t i = 0; i < cur_p->size; ++i) {
|
|
3353
|
+
if (cur_p->data[i].logit == -INFINITY) {
|
|
3354
|
+
// don't transform logits that are -INFINITY
|
|
3355
|
+
// (as masked out by e.g. min-p and top-p when using backend sampling)
|
|
3356
|
+
continue;
|
|
3357
|
+
}
|
|
3358
|
+
float dist = std::abs((cur_p->data[i].p - adapted_target) * INV_WIDTH);
|
|
3359
|
+
cur_p->data[i].logit = PEAK_LOGIT_VALUE - SHARPNESS * dist * dist / (1.0f + dist);
|
|
3360
|
+
}
|
|
3361
|
+
|
|
3362
|
+
// softmax and sample from the transformed distribution
|
|
3363
|
+
llama_sampler_softmax_impl(cur_p, false);
|
|
3364
|
+
const int idx = llama_sample_dist(cur_p, ctx->rng);
|
|
3365
|
+
cur_p->selected = idx;
|
|
3366
|
+
|
|
3367
|
+
// store the selected token ID for acceptance later
|
|
3368
|
+
ctx->pending_token_id = cur_p->data[idx].id;
|
|
3369
|
+
ctx->pending_token_idx = idx;
|
|
3370
|
+
}
|
|
3371
|
+
|
|
3372
|
+
static void llama_sampler_adaptive_p_accept(struct llama_sampler * smpl, llama_token token) {
|
|
3373
|
+
auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
|
|
3374
|
+
if (ctx->pending_token_id == token) {
|
|
3375
|
+
GGML_ASSERT(ctx->pending_token_id != LLAMA_TOKEN_NULL);
|
|
3376
|
+
GGML_ASSERT(ctx->pending_token_idx != -1);
|
|
3377
|
+
// update EMA with the original probability of the selected token
|
|
3378
|
+
ctx->weighted_sum = ctx->original_probs[ctx->pending_token_idx] + ctx->decay * ctx->weighted_sum;
|
|
3379
|
+
ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight;
|
|
3380
|
+
}
|
|
3381
|
+
ctx->pending_token_id = LLAMA_TOKEN_NULL;
|
|
3382
|
+
ctx->pending_token_idx = -1;
|
|
3383
|
+
}
|
|
3384
|
+
|
|
3385
|
+
static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) {
|
|
3386
|
+
auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
|
|
3387
|
+
// ctx->target and ctx->decay never change after init, so it's safe to keep them as is.
|
|
3388
|
+
// original_probs is completely overwritten on every call to _apply.
|
|
3389
|
+
// so we only need to reset the EMA state and pending token.
|
|
3390
|
+
ctx->weighted_sum = ctx->target / (1.0f - ctx->decay);
|
|
3391
|
+
ctx->total_weight = 1.0f / (1.0f - ctx->decay);
|
|
3392
|
+
ctx->pending_token_id = LLAMA_TOKEN_NULL;
|
|
3393
|
+
ctx->pending_token_idx = -1;
|
|
3394
|
+
ctx->seed_cur = get_rng_seed(ctx->seed);
|
|
3395
|
+
ctx->rng.seed(ctx->seed_cur);
|
|
3396
|
+
}
|
|
3397
|
+
|
|
3398
|
+
static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) {
|
|
3399
|
+
const auto * ctx = (const llama_sampler_adaptive_p *) smpl->ctx;
|
|
3400
|
+
auto * result = llama_sampler_init_adaptive_p(ctx->target, ctx->decay, ctx->seed);
|
|
3401
|
+
auto * result_ctx = (llama_sampler_adaptive_p *) result->ctx;
|
|
3402
|
+
|
|
3403
|
+
// copy everything (target, decay, seed, and RNG are already set)
|
|
3404
|
+
result_ctx->weighted_sum = ctx->weighted_sum;
|
|
3405
|
+
result_ctx->total_weight = ctx->total_weight;
|
|
3406
|
+
result_ctx->pending_token_id = ctx->pending_token_id;
|
|
3407
|
+
result_ctx->pending_token_idx = ctx->pending_token_idx;
|
|
3408
|
+
|
|
3409
|
+
return result;
|
|
3410
|
+
}
|
|
3411
|
+
|
|
3412
|
+
static void llama_sampler_adaptive_p_free(struct llama_sampler * smpl) {
|
|
3413
|
+
delete (llama_sampler_adaptive_p *) smpl->ctx;
|
|
3414
|
+
}
|
|
3415
|
+
|
|
3416
|
+
// interface table of the adaptive-p sampler
// all backend_* hooks are left unset (nullptr)
static struct llama_sampler_i llama_sampler_adaptive_p_i = {
    /* .name              = */ &llama_sampler_adaptive_p_name,
    /* .accept            = */ &llama_sampler_adaptive_p_accept,
    /* .apply             = */ &llama_sampler_adaptive_p_apply,
    /* .reset             = */ &llama_sampler_adaptive_p_reset,
    /* .clone             = */ &llama_sampler_adaptive_p_clone,
    /* .free              = */ &llama_sampler_adaptive_p_free,
    /* .backend_init      = */ nullptr,
    /* .backend_accept    = */ nullptr,
    /* .backend_apply     = */ nullptr,
    /* .backend_set_input = */ nullptr,
};
|
|
3428
|
+
|
|
3429
|
+
struct llama_sampler * llama_sampler_init_adaptive_p(
|
|
3430
|
+
float target,
|
|
3431
|
+
float decay,
|
|
3432
|
+
uint32_t seed
|
|
3433
|
+
) {
|
|
3434
|
+
auto seed_cur = get_rng_seed(seed);
|
|
3435
|
+
float clamped_decay = std::clamp(decay, 0.0f, 0.99f);
|
|
3436
|
+
return llama_sampler_init(
|
|
3437
|
+
/* .iface = */ &llama_sampler_adaptive_p_i,
|
|
3438
|
+
/* .ctx = */ new llama_sampler_adaptive_p {
|
|
3439
|
+
/* .target = */ target,
|
|
3440
|
+
/* .decay = */ clamped_decay,
|
|
3441
|
+
/* .seed = */ seed,
|
|
3442
|
+
/* .seed_cur = */ seed_cur,
|
|
3443
|
+
/* .rng = */ std::mt19937(seed_cur),
|
|
3444
|
+
/* .weighted_sum = */ target / (1.0f - clamped_decay),
|
|
3445
|
+
/* .total_weight = */ 1.0f / (1.0f - clamped_decay),
|
|
3446
|
+
/* .original_probs = */ {},
|
|
3447
|
+
/* .pending_token_id = */ LLAMA_TOKEN_NULL,
|
|
3448
|
+
/* .pending_token_idx = */ -1
|
|
3449
|
+
}
|
|
3450
|
+
);
|
|
3451
|
+
}
|
|
3452
|
+
|
|
3296
3453
|
// logit-bias
|
|
3297
3454
|
|
|
3298
3455
|
struct llama_sampler_logit_bias : public llama_sampler_backend {
|
|
@@ -461,6 +461,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
|
|
461
461
|
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
|
462
462
|
};
|
|
463
463
|
break;
|
|
464
|
+
case LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE:
|
|
465
|
+
regex_exprs = {
|
|
466
|
+
// original regex from tokenizer.json
|
|
467
|
+
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
|
|
468
|
+
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
|
|
469
|
+
};
|
|
470
|
+
break;
|
|
464
471
|
default:
|
|
465
472
|
// default regex for BPE tokenization pre-processing
|
|
466
473
|
regex_exprs = {
|
|
@@ -1965,6 +1972,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
1965
1972
|
} else if (
|
|
1966
1973
|
tokenizer_pre == "exaone4") {
|
|
1967
1974
|
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
|
1975
|
+
} else if (
|
|
1976
|
+
tokenizer_pre == "exaone-moe") {
|
|
1977
|
+
pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE;
|
|
1968
1978
|
} else if (
|
|
1969
1979
|
tokenizer_pre == "chameleon") {
|
|
1970
1980
|
pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
|
|
@@ -2436,7 +2446,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
2436
2446
|
auto & attr = id_to_token[t.second].attr;
|
|
2437
2447
|
|
|
2438
2448
|
if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
|
|
2439
|
-
|
|
2449
|
+
LLAMA_LOG_WARN("%s: setting token '%s' (%d) attribute to USER_DEFINED (%u), old attributes: %u\n",
|
|
2450
|
+
__func__, t.first.c_str(), t.second, LLAMA_TOKEN_ATTR_USER_DEFINED, attr);
|
|
2451
|
+
|
|
2452
|
+
attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
|
|
2440
2453
|
}
|
|
2441
2454
|
}
|
|
2442
2455
|
|
|
@@ -2489,7 +2502,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
2489
2502
|
special_eog_ids.erase(end_id);
|
|
2490
2503
|
|
|
2491
2504
|
auto & attr = id_to_token[end_id].attr;
|
|
2492
|
-
attr =
|
|
2505
|
+
attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
|
|
2493
2506
|
|
|
2494
2507
|
LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
|
|
2495
2508
|
}
|
|
@@ -3289,34 +3302,34 @@ int32_t llama_vocab::impl::detokenize(
|
|
|
3289
3302
|
}
|
|
3290
3303
|
|
|
3291
3304
|
void llama_vocab::impl::print_info() const {
|
|
3292
|
-
LLAMA_LOG_INFO("%s: vocab type
|
|
3293
|
-
LLAMA_LOG_INFO("%s: n_vocab
|
|
3294
|
-
LLAMA_LOG_INFO("%s: n_merges
|
|
3305
|
+
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
|
|
3306
|
+
LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
|
|
3307
|
+
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
|
|
3295
3308
|
|
|
3296
3309
|
// special tokens
|
|
3297
|
-
if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token
|
|
3298
|
-
if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token
|
|
3299
|
-
if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token
|
|
3300
|
-
if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token
|
|
3301
|
-
if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token
|
|
3302
|
-
if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token
|
|
3303
|
-
if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token
|
|
3304
|
-
if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token
|
|
3305
|
-
|
|
3306
|
-
if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token
|
|
3307
|
-
|
|
3308
|
-
if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token
|
|
3309
|
-
if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token
|
|
3310
|
-
if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token
|
|
3311
|
-
if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token
|
|
3312
|
-
if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token
|
|
3313
|
-
if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token
|
|
3310
|
+
if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
|
|
3311
|
+
if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
|
|
3312
|
+
if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
|
|
3313
|
+
if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
|
|
3314
|
+
if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
|
|
3315
|
+
if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
|
|
3316
|
+
if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
|
|
3317
|
+
if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
|
|
3318
|
+
|
|
3319
|
+
if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
|
|
3320
|
+
|
|
3321
|
+
if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
|
|
3322
|
+
if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
|
|
3323
|
+
if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
|
|
3324
|
+
if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
|
|
3325
|
+
if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
|
|
3326
|
+
if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
|
|
3314
3327
|
|
|
3315
3328
|
for (const auto & id : special_eog_ids) {
|
|
3316
|
-
LLAMA_LOG_INFO( "%s: EOG token
|
|
3329
|
+
LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
|
|
3317
3330
|
}
|
|
3318
3331
|
|
|
3319
|
-
LLAMA_LOG_INFO("%s: max token length
|
|
3332
|
+
LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
|
|
3320
3333
|
}
|
|
3321
3334
|
|
|
3322
3335
|
llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
#include "models.h"
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
// builds the compute graph of the EXAONE MoE architecture
//
// per transformer layer (the trailing hparams.nextn_predict_layers layers are left out):
//   RMS norm -> self-attention (Q/K RMS-normed, RoPE only on SWA layers) -> residual add
//   RMS norm -> FFN (dense when the layer has no ffn_gate_inp, otherwise routed experts
//               plus a shared expert) -> residual add -> control vector
// followed by the final RMS norm (res->t_embd) and the lm_head (res->t_logits).
llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_k;

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
    GGML_ASSERT(n_embd_head == hparams.n_rot);

    // token embeddings are the input of the first layer
    ggml_tensor * layer_in = build_inp_embd(model.tok_embd);

    // token positions
    ggml_tensor * positions = build_inp_pos();

    auto * attn_inp = build_attn_inp_kv_iswa();

    ggml_tensor * out_ids = build_inp_out_ids();

    const int n_main_layers = n_layer - hparams.nextn_predict_layers;

    for (int il = 0; il < n_main_layers; ++il) {
        const auto & layer = model.layers[il];

        // kept for the residual connection around the attention block
        ggml_tensor * residual = layer_in;

        // RoPE is applied on SWA layers only
        const bool apply_rope = hparams.is_swa(il);

        // pre-attention norm
        ggml_tensor * cur = build_norm(layer_in, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // self-attention
        {
            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

            // Q, K, V projections
            ggml_tensor * q = build_lora_mm(layer.wq, cur);
            cb(q, "Qcur", il);

            ggml_tensor * k = build_lora_mm(layer.wk, cur);
            cb(k, "Kcur", il);

            ggml_tensor * v = build_lora_mm(layer.wv, cur);
            cb(v, "Vcur", il);

            // split into heads
            q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
            k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
            v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);

            // Q/K norm
            q = build_norm(q, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il);
            k = build_norm(k, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il);
            cb(q, "Qcur_normed", il);
            cb(k, "Kcur_normed", il);

            if (apply_rope) {
                q = ggml_rope_ext(
                        ctx0, q, positions, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);

                k = ggml_rope_ext(
                        ctx0, k, positions, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);
            }
            cb(q, "Qcur", il);
            cb(k, "Kcur", il);
            cb(v, "Vcur", il);

            cur = build_attn(attn_inp,
                    layer.wo, nullptr,
                    q, k, v, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
            cb(cur, "attn_out", il);
        }

        // on the last layer only the rows selected by out_ids are carried on
        const bool is_last_layer = (il == n_main_layers - 1);
        if (is_last_layer && out_ids) {
            cur      = ggml_get_rows(ctx0, cur,      out_ids);
            residual = ggml_get_rows(ctx0, residual, out_ids);
        }

        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, residual);
        cb(ffn_inp, "ffn_inp", il);

        // pre-FFN norm
        cur = build_norm(ffn_inp, layer.ffn_norm, nullptr, LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

        // feed-forward network
        const bool is_dense = (layer.ffn_gate_inp == nullptr);
        if (is_dense) {
            cur = build_ffn(cur,
                    layer.ffn_up,   nullptr, nullptr,
                    layer.ffn_gate, nullptr, nullptr,
                    layer.ffn_down, nullptr, nullptr, nullptr,
                    LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        } else {
            // routed experts
            ggml_tensor * moe_out = build_moe_ffn(cur,
                    layer.ffn_gate_inp,
                    layer.ffn_up_exps,
                    layer.ffn_gate_exps,
                    layer.ffn_down_exps,
                    layer.ffn_exp_probs_b,
                    n_expert, n_expert_used,
                    LLM_FFN_SILU, hparams.expert_weights_norm,
                    true, hparams.expert_weights_scale,
                    (llama_expert_gating_func_type) hparams.expert_gating_func,
                    il);
            cb(moe_out, "ffn_moe_out", il);

            // shared expert, summed with the routed experts
            ggml_tensor * shexp_out = build_ffn(cur,
                    layer.ffn_up_shexp,   nullptr, nullptr,
                    layer.ffn_gate_shexp, nullptr, nullptr,
                    layer.ffn_down_shexp, nullptr, nullptr,
                    nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(shexp_out, "ffn_shexp", il);

            cur = ggml_add(ctx0, moe_out, shexp_out);
            cb(cur, "ffn_out", il);
        }

        // residual around the FFN
        cur = ggml_add(ctx0, cur, ffn_inp);

        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        // output of this layer is the input of the next one
        layer_in = cur;
    }

    // final norm
    ggml_tensor * cur = build_norm(layer_in, model.output_norm, nullptr, LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = build_lora_mm(model.output, cur);

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}
|
|
@@ -255,10 +255,20 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
|
|
|
255
255
|
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
|
|
256
256
|
inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
|
|
257
257
|
cb(inp_per_layer, "inp_per_layer_selected", -1);
|
|
258
|
+
res->add_input(std::move(inp));
|
|
258
259
|
} else {
|
|
259
|
-
|
|
260
|
+
// Vision embedding path: use padding token (ID=0) embedding
|
|
261
|
+
// TODO: verify if this is the correct behavior in transformers implementation
|
|
262
|
+
const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer
|
|
263
|
+
|
|
264
|
+
// Extract and dequantize padding token embedding (row 0)
|
|
265
|
+
ggml_tensor * padding = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
|
|
266
|
+
inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
|
|
267
|
+
|
|
268
|
+
// Reshape to [n_embd_altup, n_layer, 1]
|
|
269
|
+
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
|
|
270
|
+
cb(inp_per_layer, "inp_per_layer_vision", -1);
|
|
260
271
|
}
|
|
261
|
-
res->add_input(std::move(inp));
|
|
262
272
|
return inp_per_layer;
|
|
263
273
|
}
|
|
264
274
|
|
|
@@ -276,7 +286,7 @@ ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp
|
|
|
276
286
|
-1); // [n_embd_altup, n_layer, n_tokens]
|
|
277
287
|
cb(per_layer_proj, "per_layer_proj", -1);
|
|
278
288
|
|
|
279
|
-
inp_per_layer = ggml_add(ctx0,
|
|
289
|
+
inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer);
|
|
280
290
|
inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
|
|
281
291
|
cb(inp_per_layer, "inp_per_layer", -1);
|
|
282
292
|
|
|
@@ -167,6 +167,10 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
167
167
|
llm_build_exaone(const llama_model & model, const llm_graph_params & params);
|
|
168
168
|
};
|
|
169
169
|
|
|
170
|
+
struct llm_build_exaone_moe : public llm_graph_context {
|
|
171
|
+
llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params);
|
|
172
|
+
};
|
|
173
|
+
|
|
170
174
|
struct llm_build_falcon : public llm_graph_context {
|
|
171
175
|
llm_build_falcon(const llama_model & model, const llm_graph_params & params);
|
|
172
176
|
};
|
|
@@ -466,7 +470,8 @@ private:
|
|
|
466
470
|
ggml_tensor * cur,
|
|
467
471
|
int il);
|
|
468
472
|
|
|
469
|
-
|
|
473
|
+
// returns pair of output and new state
|
|
474
|
+
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
|
|
470
475
|
ggml_tensor * q,
|
|
471
476
|
ggml_tensor * k,
|
|
472
477
|
ggml_tensor * v,
|
|
@@ -478,7 +483,8 @@ private:
|
|
|
478
483
|
ggml_tensor * diag_mask,
|
|
479
484
|
int il);
|
|
480
485
|
|
|
481
|
-
|
|
486
|
+
// returns pair of output and new state
|
|
487
|
+
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
|
|
482
488
|
ggml_tensor * q,
|
|
483
489
|
ggml_tensor * k,
|
|
484
490
|
ggml_tensor * v,
|
|
@@ -493,6 +499,11 @@ private:
|
|
|
493
499
|
ggml_tensor * gate,
|
|
494
500
|
int layer);
|
|
495
501
|
|
|
502
|
+
// returns pair of qkv, z
|
|
503
|
+
std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
|
|
504
|
+
ggml_tensor * input,
|
|
505
|
+
int il);
|
|
506
|
+
|
|
496
507
|
const llama_model & model;
|
|
497
508
|
};
|
|
498
509
|
|