@fugood/llama.node 0.4.7 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +20 -6
- package/lib/index.js +41 -17
- package/lib/index.ts +50 -23
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +9 -9
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +37 -18
- package/src/LlamaContext.h +1 -0
- package/src/TokenizeWorker.cpp +16 -12
- package/src/TokenizeWorker.h +2 -2
- package/src/common.hpp +54 -50
- package/src/llama.cpp/.github/workflows/build.yml +2 -2
- package/src/llama.cpp/.github/workflows/release.yml +152 -129
- package/src/llama.cpp/.github/workflows/winget.yml +42 -0
- package/src/llama.cpp/common/arg.cpp +14 -13
- package/src/llama.cpp/common/common.cpp +4 -75
- package/src/llama.cpp/common/common.h +7 -12
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
- package/src/llama.cpp/examples/simple/simple.cpp +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
- package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
- package/src/llama.cpp/ggml/include/ggml.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
- package/src/llama.cpp/ggml/src/ggml.c +64 -18
- package/src/llama.cpp/include/llama.h +24 -124
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/src/llama-batch.cpp +3 -1
- package/src/llama.cpp/src/llama-context.cpp +60 -110
- package/src/llama.cpp/src/llama-graph.cpp +137 -233
- package/src/llama.cpp/src/llama-graph.h +49 -7
- package/src/llama.cpp/src/llama-hparams.cpp +17 -1
- package/src/llama.cpp/src/llama-hparams.h +34 -5
- package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
- package/src/llama.cpp/src/llama-kv-cache.h +201 -85
- package/src/llama.cpp/src/llama-memory.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +273 -94
- package/src/llama.cpp/src/llama-model.h +4 -1
- package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
- package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
- package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
- package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
- package/src/llama.cpp/tools/mtmd/clip.h +6 -4
- package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
- package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
- package/src/llama.cpp/tools/run/run.cpp +2 -2
- package/src/llama.cpp/tools/server/server.cpp +158 -47
- package/src/llama.cpp/tools/server/utils.hpp +71 -43
- package/src/llama.cpp/tools/tts/tts.cpp +4 -2

--- a/package/src/llama.cpp/src/llama-model.cpp
+++ b/package/src/llama.cpp/src/llama-model.cpp
@@ -463,11 +463,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         GGML_ASSERT(hparams.n_expert_used == 0);
     }

-    // zero-out the array hparams
     std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
     std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
     std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);

+    std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
+
+    std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
+
     ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);

@@ -571,9 +574,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                 ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
-
-                hparams.
-                hparams.n_swa
+
+                hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
+                hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
+                hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full

                 switch (hparams.n_expert) {
                     case 16: type = LLM_TYPE_17B_16E; break;
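Note on the hunk above: the truncated removed lines show per-architecture SWA fields being replaced by `hparams.swa_type` plus a `hparams.set_swa_pattern(n)` call. As a rough, hypothetical sketch (not the llama.cpp implementation, which lives in the also-changed `llama-hparams.cpp`), a pattern helper of this kind could flag every n-th layer as full attention and the rest as sliding-window:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical sketch only - not the llama.cpp implementation.
// Mark which layers use sliding-window attention (SWA) for a repeating
// group of n_pattern layers whose last layer is full attention.
static std::vector<bool> make_swa_layers(uint32_t n_layer, uint32_t n_pattern) {
    std::vector<bool> swa_layers(n_layer, false);
    for (uint32_t il = 0; il < n_layer; ++il) {
        // n_pattern <= 1 means "no SWA layers at all"
        swa_layers[il] = n_pattern > 1 && (il % n_pattern) < (n_pattern - 1);
    }
    return swa_layers;
}

int main() {
    const auto layers = make_swa_layers(8, 4); // 3 SWA layers, then 1 full, repeated
    for (size_t il = 0; il < layers.size(); ++il) {
        std::printf("layer %zu: %s\n", il, layers[il] ? "swa" : "full");
    }
}
```

With `n_pattern = 4` this reproduces the "3 chunked - 1 full" layout mentioned in the comment above.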
@@ -852,22 +856,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }

-
-
-
-
-
-
-                //
-                hparams.
-
-
-
-                    hparams.n_swa = 131072;
-                }
-                bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-                if (!found_swa && hparams.n_swa == 0) {
-                    throw std::runtime_error("invalid value for sliding_window");
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+
+                if (found_swa && hparams.n_swa > 0) {
+                    LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
+                            __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
+
+                    // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+
+                    hparams.n_swa = 0;
+                    hparams.set_swa_pattern(1);
                 }
             } break;
         case LLM_ARCH_PHIMOE:
@@ -937,8 +936,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GEMMA2:
             {
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                 hparams.n_swa = 4096; // default value of gemma 2
-                hparams.
+                hparams.set_swa_pattern(2);
                 hparams.attn_soft_cap = true;

                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -955,7 +955,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GEMMA3:
             {
-                hparams.
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                hparams.set_swa_pattern(6);

                 hparams.rope_freq_base_train_swa = 10000.0f;
                 hparams.rope_freq_scale_train_swa = 1.0f;
@@ -1039,7 +1040,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_COHERE2:
             {
-                hparams.
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                hparams.set_swa_pattern(4);

                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -4321,7 +4323,7 @@ void llama_model::print_info() const {
     LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
     LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
     LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
-    LLAMA_LOG_INFO("%s:
+    LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
     LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
     LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
     LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
@@ -4489,7 +4491,17 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
     return it->second;
 }

-
+float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
+    return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
+}
+
+float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
+    return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
+}
+
+ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
+    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+
     // choose long/short freq factors based on the context size
     if (layers[il].rope_freqs != nullptr) {
         return layers[il].rope_freqs;
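The three helpers added above centralize the per-layer RoPE parameter selection that graph builders previously inlined (compare the Gemma3 hunk further down, where `is_swa ? ... : ...` expressions are replaced by these calls). A toy sketch of the same selection logic, using stand-in structs instead of the real `llama_hparams`/`llama_cparams` types:

```cpp
#include <cstdio>

// Stand-in types for illustration; the real code uses llama_hparams/llama_cparams.
struct toy_hparams {
    bool  swa_layer[4]       = {true, true, true, false}; // which layers are SWA
    float rope_freq_base_swa = 10000.0f;                  // training-time base for SWA layers
};
struct toy_cparams {
    float rope_freq_base = 500000.0f; // runtime base for full-attention layers
};

// Mirrors the shape of llama_model::get_rope_freq_base: SWA layers keep their
// training-time frequency base, full-attention layers use the runtime value.
static float rope_freq_base_for_layer(const toy_hparams & h, const toy_cparams & c, int il) {
    return h.swa_layer[il] ? h.rope_freq_base_swa : c.rope_freq_base;
}

int main() {
    toy_hparams h;
    toy_cparams c;
    for (int il = 0; il < 4; ++il) {
        std::printf("layer %d: freq_base = %.1f\n", il, rope_freq_base_for_layer(h, c, il));
    }
}
```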
@@ -4517,21 +4529,174 @@ struct llm_build_llama : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network (non-MoE)
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                // MoE branch
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_moe_ffn(cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        il);
+                cb(cur, "ffn_moe_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
+struct llm_build_llama_iswa : public llm_graph_context {
+    llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
         // temperature tuning
         ggml_tensor * inp_attn_scale = nullptr;
-
-            inp_attn_scale = build_inp_attn_scale();
-        }
+        inp_attn_scale = build_inp_attn_scale();

-        auto * inp_attn =
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();

         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

-            bool use_rope =
-                ? (il + 1) % hparams.n_no_rope_layer_step != 0
-                : true;
+            const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;

             // norm
             cur = build_norm(inpL,
@@ -4542,7 +4707,7 @@ struct llm_build_llama : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4590,7 +4755,7 @@ struct llm_build_llama : public llm_graph_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);

-                if (
+                if (use_rope && hparams.use_kq_norm) {
                     // Llama4TextL2Norm
                     Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
                     Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
@@ -4616,7 +4781,6 @@ struct llm_build_llama : public llm_graph_context {

             // feed-forward network (non-MoE)
             if (model.layers[il].ffn_gate_inp == nullptr) {
-
                 cur = build_norm(ffn_inp,
                         model.layers[il].ffn_norm, NULL,
                         LLM_NORM_RMS, il);
@@ -4629,9 +4793,7 @@ struct llm_build_llama : public llm_graph_context {
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, il);
                 cb(cur, "ffn_out", il);
-
-            } else if (arch == LLM_ARCH_LLAMA4) {
-                // llama4 MoE
+            } else {
                 ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
                         model.layers[il].ffn_norm, NULL,
                         LLM_NORM_RMS, il);
@@ -4660,26 +4822,6 @@ struct llm_build_llama : public llm_graph_context {

                 cur = ggml_add(ctx0, moe_out, shexp_out);
                 cb(cur, "ffn_moe_out_merged", il);
-
-            } else {
-                // MoE branch
-                cur = build_norm(ffn_inp,
-                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = build_moe_ffn(cur,
-                        model.layers[il].ffn_gate_inp,
-                        model.layers[il].ffn_up_exps,
-                        model.layers[il].ffn_gate_exps,
-                        model.layers[il].ffn_down_exps,
-                        nullptr,
-                        n_expert, n_expert_used,
-                        LLM_FFN_SILU, true,
-                        false, 0.0,
-                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
-                        il);
-                cb(cur, "ffn_moe_out", il);
             }

             cur = ggml_add(ctx0, cur, ffn_inp);
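In the new `llm_build_llama_iswa` above, RoPE is applied only when `(il + 1) % hparams.n_no_rope_layer_step != 0`. A small illustration of which layers that predicate selects, using made-up values for the layer count and step:

```cpp
#include <cstdio>

int main() {
    // Hypothetical values for illustration only; the real step comes from hparams.
    const int n_layer = 8;
    const int n_no_rope_layer_step = 4; // every 4th layer skips RoPE

    for (int il = 0; il < n_layer; ++il) {
        const bool use_rope = (il + 1) % n_no_rope_layer_step != 0;
        std::printf("layer %d: %s\n", il, use_rope ? "RoPE" : "no RoPE");
    }
}
```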
@@ -4753,7 +4895,7 @@ struct llm_build_deci : public llm_graph_context {
             } else if (n_head > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -7202,6 +7344,7 @@ struct llm_build_phi2 : public llm_graph_context {
     }
 };

+template<bool iswa>
 struct llm_build_phi3 : public llm_graph_context {
     llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
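The `template<bool iswa>` parameter added to `llm_build_phi3` lets one builder choose its attention-input type at compile time (see the next hunk). A self-contained sketch of that `std::conditional_t` plus `if constexpr` idiom, with dummy types standing in for the llama.cpp graph-input classes:

```cpp
#include <memory>
#include <type_traits>

struct input_kv_unified      { /* stand-in for llm_graph_input_attn_kv_unified */ };
struct input_kv_unified_iswa { /* stand-in for the SWA variant */ };

template <bool iswa>
struct builder {
    // pick the input type at compile time; exactly one branch below is instantiated
    using inp_attn_type = std::conditional_t<iswa, input_kv_unified_iswa, input_kv_unified>;

    std::unique_ptr<inp_attn_type> inp_attn;

    builder() {
        if constexpr (iswa) {
            inp_attn = std::make_unique<input_kv_unified_iswa>();
        } else {
            inp_attn = std::make_unique<input_kv_unified>();
        }
    }
};

int main() {
    builder<true>  with_swa;    // SWA-aware KV-cache input
    builder<false> without_swa; // plain unified KV-cache input
    (void)with_swa; (void)without_swa;
}
```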
@@ -7217,7 +7360,14 @@ struct llm_build_phi3 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+        inp_attn_type * inp_attn = nullptr;
+
+        if constexpr (iswa) {
+            inp_attn = build_attn_inp_kv_unified_iswa();
+        } else {
+            inp_attn = build_attn_inp_kv_unified();
+        }

         for (int il = 0; il < n_layer; ++il) {
             auto * residual = inpL;
@@ -7225,7 +7375,7 @@ struct llm_build_phi3 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                ggml_tensor * rope_factors = model.get_rope_factors(
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                 ggml_tensor* attn_norm_output = build_norm(inpL,
                         model.layers[il].attn_norm,
@@ -7977,7 +8127,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

-            ggml_tensor * rope_factors = model.get_rope_factors(
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

             // norm
             cur = build_norm(inpL,
@@ -8277,8 +8427,8 @@ struct llm_build_gemma : public llm_graph_context {
     }
 };

-struct
-
+struct llm_build_gemma2_iswa : public llm_graph_context {
+    llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_k;

         ggml_tensor * cur;
@@ -8292,7 +8442,7 @@ struct llm_build_gemma2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn =
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();

         for (int il = 0; il < n_layer; ++il) {
             // norm
@@ -8414,8 +8564,8 @@ struct llm_build_gemma2 : public llm_graph_context {
     }
 };

-struct
-
+struct llm_build_gemma3_iswa : public llm_graph_context {
+    llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_k;

         ggml_tensor * cur;
@@ -8433,13 +8583,11 @@ struct llm_build_gemma3 : public llm_graph_context {
         ggml_tensor * inp_pos = build_inp_pos();

         // TODO: is causal == true correct? might need some changes
-        auto * inp_attn =
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();

         for (int il = 0; il < n_layer; ++il) {
-            const
-
-            const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
-            const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
+            const float freq_base_l = model.get_rope_freq_base (cparams, il);
+            const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

             // norm
             cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
@@ -9016,8 +9164,8 @@ struct llm_build_command_r : public llm_graph_context {
     }
 };

-struct
-
+struct llm_build_cohere2_iswa : public llm_graph_context {
+    llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;

         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9032,7 +9180,7 @@ struct llm_build_cohere2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn =
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();

         for (int il = 0; il < n_layer; ++il) {
             const bool is_swa = hparams.is_swa(il);
@@ -9045,7 +9193,7 @@ struct llm_build_cohere2 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                ggml_tensor * rope_factors = model.get_rope_factors(
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9983,7 +10131,7 @@ struct llm_build_deepseek : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11347,7 +11495,7 @@ struct llm_build_exaone : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12263,7 +12411,7 @@ struct llm_build_granite : public llm_graph_context {
             Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

             if (use_rope) {
-                ggml_tensor * rope_factors = model.get_rope_factors(
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
                 Qcur = ggml_rope_ext(
                         ctx0, Qcur, inp_pos, rope_factors,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -12916,7 +13064,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -13044,6 +13192,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
+        case LLM_ARCH_WAVTOKENIZER_DEC:
             {
                 res = nullptr;
             } break;
@@ -13058,7 +13207,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         GGML_TYPE_F32,
                         GGML_TYPE_F32,
                         cparams.offload_kqv,
-                        std::max((uint32_t) 1, cparams.n_seq_max)
+                        std::max((uint32_t) 1, cparams.n_seq_max),
+                        cparams.n_seq_max);
             } break;
         default:
             {
@@ -13068,14 +13218,36 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,

                 LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);

-
-
-
-
-
-
-
-
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    GGML_ASSERT(hparams.is_swa_any());
+
+                    res = new llama_kv_cache_unified_iswa(
+                            *this,
+                            params.type_k,
+                            params.type_v,
+                            !cparams.flash_attn,
+                            cparams.offload_kqv,
+                            params.swa_full,
+                            cparams.n_ctx,
+                            cparams.n_seq_max,
+                            cparams.n_batch,
+                            padding);
+                } else {
+                    GGML_ASSERT(!hparams.is_swa_any());
+
+                    res = new llama_kv_cache_unified(
+                            *this,
+                            nullptr,
+                            params.type_k,
+                            params.type_v,
+                            !cparams.flash_attn,
+                            cparams.offload_kqv,
+                            cparams.n_ctx,
+                            cparams.n_seq_max,
+                            padding,
+                            hparams.n_swa,
+                            hparams.swa_type);
+                }
             }
         }

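The `create_memory` hunk above picks between two KV-cache implementations depending on whether any layer uses sliding-window attention. A simplified, hypothetical factory sketch of that decision, with stand-in types rather than the real `llama_kv_cache_unified*` constructors (which take many more arguments, as the diff shows):

```cpp
#include <memory>

// Stand-ins for the two cache types created in llama_model::create_memory.
struct kv_cache              { virtual ~kv_cache() = default; };
struct kv_cache_unified      : kv_cache {};
struct kv_cache_unified_iswa : kv_cache {};

enum class swa_type { none, standard, chunked };

// Simplified decision: any SWA layers -> dual (SWA + non-SWA) cache,
// otherwise a single unified cache.
static std::unique_ptr<kv_cache> make_cache(swa_type t) {
    if (t != swa_type::none) {
        return std::make_unique<kv_cache_unified_iswa>();
    }
    return std::make_unique<kv_cache_unified>();
}

int main() {
    auto gemma_like = make_cache(swa_type::standard); // e.g. the Gemma 2/3 cases above
    auto llama_like = make_cache(swa_type::none);     // e.g. plain LLaMA
    (void)gemma_like; (void)llama_like;
}
```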
@@ -13090,11 +13262,14 @@ llm_graph_result_ptr llama_model::build_graph(

     switch (arch) {
         case LLM_ARCH_LLAMA:
-        case LLM_ARCH_LLAMA4:
         case LLM_ARCH_MINICPM:
             {
                 llm = std::make_unique<llm_build_llama>(*this, params, gf);
             } break;
+        case LLM_ARCH_LLAMA4:
+            {
+                llm = std::make_unique<llm_build_llama_iswa>(*this, params, gf);
+            } break;
         case LLM_ARCH_DECI:
             {
                 llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13169,7 +13344,11 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_PHI3:
         case LLM_ARCH_PHIMOE:
             {
-
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    llm = std::make_unique<llm_build_phi3<true>> (*this, params, gf);
+                } else {
+                    llm = std::make_unique<llm_build_phi3<false>>(*this, params, gf);
+                }
             } break;
         case LLM_ARCH_PLAMO:
             {
@@ -13201,11 +13380,11 @@ llm_graph_result_ptr llama_model::build_graph(
             } break;
         case LLM_ARCH_GEMMA2:
             {
-                llm = std::make_unique<
+                llm = std::make_unique<llm_build_gemma2_iswa>(*this, params, gf);
             } break;
         case LLM_ARCH_GEMMA3:
             {
-                llm = std::make_unique<
+                llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
             } break;
         case LLM_ARCH_STARCODER2:
             {
@@ -13225,7 +13404,7 @@ llm_graph_result_ptr llama_model::build_graph(
             } break;
         case LLM_ARCH_COHERE2:
             {
-                llm = std::make_unique<
+                llm = std::make_unique<llm_build_cohere2_iswa>(*this, params, gf);
             } break;
         case LLM_ARCH_DBRX:
             {

--- a/package/src/llama.cpp/src/llama-model.h
+++ b/package/src/llama.cpp/src/llama-model.h
@@ -398,7 +398,10 @@ struct llama_model {

     const struct ggml_tensor * get_tensor(const char * name) const;

-
+    float get_rope_freq_base (const llama_cparams & cparams, int il) const;
+    float get_rope_freq_scale(const llama_cparams & cparams, int il) const;
+
+    ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;

     // note: can mutate `cparams`
     // TODO: move this to new llm_arch_model_i interface

--- a/package/src/llama.cpp/tests/test-arg-parser.cpp
+++ b/package/src/llama.cpp/tests/test-arg-parser.cpp
@@ -128,7 +128,7 @@ int main(void) {

     if (common_has_curl()) {
         printf("test-arg-parser: test curl-related functions\n\n");
-        const char * GOOD_URL = "https://
+        const char * GOOD_URL = "https://ggml.ai/";
         const char * BAD_URL = "https://www.google.com/404";
         const char * BIG_FILE = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v1.bin";
