@fugood/llama.node 1.1.7 → 1.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/LlamaContext.cpp +9 -0
- package/src/common.hpp +8 -1
- package/src/llama.cpp/common/arg.cpp +8 -1
- package/src/llama.cpp/common/chat.cpp +0 -1
- package/src/llama.cpp/common/common.h +6 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
- package/src/llama.cpp/src/llama-context.cpp +0 -5
- package/src/llama.cpp/src/llama-model.cpp +80 -70
- package/src/llama.cpp/src/llama-model.h +1 -0
package/package.json
CHANGED

@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.1.7",
+  "version": "1.1.8",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -71,19 +71,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.1.7",
-    "@fugood/node-llama-linux-x64-vulkan": "1.1.7",
-    "@fugood/node-llama-linux-x64-cuda": "1.1.7",
-    "@fugood/node-llama-linux-arm64": "1.1.7",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.1.7",
-    "@fugood/node-llama-linux-arm64-cuda": "1.1.7",
-    "@fugood/node-llama-win32-x64": "1.1.7",
-    "@fugood/node-llama-win32-x64-vulkan": "1.1.7",
-    "@fugood/node-llama-win32-x64-cuda": "1.1.7",
-    "@fugood/node-llama-win32-arm64": "1.1.7",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.1.7",
-    "@fugood/node-llama-darwin-x64": "1.1.7",
-    "@fugood/node-llama-darwin-arm64": "1.1.7"
+    "@fugood/node-llama-linux-x64": "1.1.8",
+    "@fugood/node-llama-linux-x64-vulkan": "1.1.8",
+    "@fugood/node-llama-linux-x64-cuda": "1.1.8",
+    "@fugood/node-llama-linux-arm64": "1.1.8",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.1.8",
+    "@fugood/node-llama-linux-arm64-cuda": "1.1.8",
+    "@fugood/node-llama-win32-x64": "1.1.8",
+    "@fugood/node-llama-win32-x64-vulkan": "1.1.8",
+    "@fugood/node-llama-win32-x64-cuda": "1.1.8",
+    "@fugood/node-llama-win32-arm64": "1.1.8",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.1.8",
+    "@fugood/node-llama-darwin-x64": "1.1.8",
+    "@fugood/node-llama-darwin-arm64": "1.1.8"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/LlamaContext.cpp
CHANGED

@@ -636,6 +636,15 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
         _sess, _templates, messages, chat_template, json_schema_str, tools_str,
         parallel_tool_calls, tool_choice, enable_thinking,
         add_generation_prompt, now_str, chat_template_kwargs);
+  } catch (const nlohmann::json_abi_v3_12_0::detail::parse_error& e) {
+    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+    return env.Undefined();
+  } catch (const std::invalid_argument& e) {
+    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+    return env.Undefined();
+  } catch (const std::runtime_error& e) {
+    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+    return env.Undefined();
   } catch (const std::exception &e) {
     Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
     return env.Undefined();
package/src/common.hpp
CHANGED

@@ -461,7 +461,14 @@ processMediaPrompt(llama_context *ctx, mtmd_context *mtmd_ctx,
   }

   // Clear all KV cache entries after position n_past
-
+  auto * kv = llama_get_memory(ctx);
+  bool clear_result = llama_memory_seq_rm(kv, 0, n_past, -1);
+  if (!clear_result) {
+    fprintf(stdout, "[DEBUG] llama_memory_seq_rm failed (likely using a non-Transformer model)! Trying full clear...");
+    llama_memory_clear(kv, false);
+    n_past = 0;
+    new_n_past = n_past;
+  }

   size_t num_chunks = mtmd_input_chunks_size(chunks);
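The added block follows a remove-or-clear pattern from the llama.cpp memory API. Below is a minimal standalone sketch of that pattern, using only the calls that appear in the hunk above; the helper name trim_kv_to and the caller-maintained n_past are illustrative, not part of the package.

#include "llama.h"

// Sketch only: drop everything after position n_past in sequence 0, falling
// back to a full clear (and a reset of n_past) when the memory backend cannot
// remove a suffix, e.g. for recurrent models -- the same behaviour as above.
static int trim_kv_to(llama_context * ctx, int n_past) {
    auto * kv = llama_get_memory(ctx);
    if (!llama_memory_seq_rm(kv, 0, n_past, -1)) {
        llama_memory_clear(kv, false); // drop cell metadata, keep buffers
        return 0;                      // caller must re-decode from scratch
    }
    return n_past;
}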
package/src/llama.cpp/common/arg.cpp
CHANGED

@@ -1530,6 +1530,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.ctx_shift = false;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    add_opt(common_arg(
+        {"--context-shift"},
+        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+        [](common_params & params) {
+            params.ctx_shift = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--chunks"}, "N",
         string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1823,7 +1830,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.sampling.top_n_sigma = std::stof(value);
         }
-    ).
+    ).set_sparam());
     add_opt(common_arg(
         {"--xtc-probability"}, "N",
         string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
package/src/llama.cpp/common/chat.cpp
CHANGED

@@ -619,7 +619,6 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
         case COMMON_REASONING_FORMAT_AUTO: return "auto";
         case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
         case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
-        case COMMON_REASONING_FORMAT_GRANITE: return "granite";
         default:
             throw std::runtime_error("Unknown reasoning format");
     }
package/src/llama.cpp/common/common.h
CHANGED

@@ -239,12 +239,15 @@ struct common_params_diffusion {
     bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
 };

+// reasoning API response format (not to be confused as chat template's reasoning format)
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_AUTO,
+    COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
-
+    // do not extend this enum unless you absolutely have to
+    // in most cases, use COMMON_REASONING_FORMAT_AUTO
+    // see: https://github.com/ggml-org/llama.cpp/pull/15408
 };


@@ -373,7 +376,7 @@ struct common_params {
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
-    bool ctx_shift =
+    bool ctx_shift = false; // context shift on inifinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified = false; // enable unified KV cache

package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c
CHANGED

@@ -278,6 +278,72 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif
 }

+void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_MXFP4 == 0);
+    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
+
+    const block_mxfp4 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_MXFP4;
+
+    int ib = 0;
+    float sumf = 0;
+
+#if defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char vshift4 = vec_splats((unsigned char)4);
+    vector float vsumf0 = vec_splats(0.0f);
+
+    vector signed char kv = vec_xl(0, (const signed char *)kvalues_mxfp4);
+
+#pragma GCC unroll 8
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d) *
+                                      GGML_E8M0_TO_FP32_HALF(x[ib].e));
+
+        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
+        vector signed char q8y1 = vec_xl(16, y[ib].qs);
+
+        vector signed char qxs = (vector signed char)vec_xl(0, x[ib].qs);
+
+        vector unsigned char lo_nibbles = (vector unsigned char)vec_and(qxs, lowMask);
+        vector unsigned char hi_nibbles = (vector unsigned char)vec_sr(qxs, vshift4);
+
+        vector signed char q4x0 = vec_perm(kv, kv, lo_nibbles);
+        vector signed char q4x1 = vec_perm(kv, kv, hi_nibbles);
+
+        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vsumi0 = vec_sum4s(qv0, vsumi0);
+        vsumi0 = vec_sum4s(qv1, vsumi0);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vyd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+    sumf = vec_extract(vsumf0, 0);
+    *s = sumf;
+#else
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
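For builds without a POWER9 toolchain, the fallback branch above dispatches to ggml_vec_dot_mxfp4_q8_0_generic. The following scalar sketch shows the same reduction, written against the block layout the vector code implies; it is an illustration of the math, not the upstream generic implementation, and the helper name mxfp4_q8_0_dot_scalar is made up for this note.

// Scalar sketch, assuming the block definitions the file above relies on
// (block_mxfp4, block_q8_0, kvalues_mxfp4 from ggml's CPU quant headers).
static void mxfp4_q8_0_dot_scalar(int n, float * s, const void * vx, const void * vy) {
    const block_mxfp4 * x = (const block_mxfp4 *) vx;
    const block_q8_0  * y = (const block_q8_0  *) vy;
    const int nb = n / QK_MXFP4;
    float sumf = 0.0f;
    for (int ib = 0; ib < nb; ++ib) {
        // per-block scale: q8 fp16 scale times the mxfp4 E8M0 exponent (halved)
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d) * GGML_E8M0_TO_FP32_HALF(x[ib].e);
        int sumi = 0;
        for (int j = 0; j < QK_MXFP4/2; ++j) {
            // low nibbles pair with q8 values 0..15, high nibbles with 16..31,
            // mirroring the vec_perm/vec_mule arrangement of the POWER9 loop
            sumi += y[ib].qs[j]              * kvalues_mxfp4[x[ib].qs[j] & 0x0F];
            sumi += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4];
        }
        sumf += d * sumi;
    }
    *s = sumf;
}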
package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h
CHANGED

@@ -73,7 +73,6 @@
 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
-#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
package/src/llama.cpp/src/llama-context.cpp
CHANGED

@@ -145,11 +145,6 @@ llama_context::llama_context(
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }

-    if (!params.swa_full && cparams.n_seq_max > 1 && hparams.is_swa_any()) {
-        LLAMA_LOG_WARN("%s: requested n_seq_max (%u) > 1, but swa_full is not enabled -- performance may be degraded: %s\n",
-                __func__, cparams.n_seq_max, "https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573");
-    }
-
     if (!hparams.vocab_only) {
         // GPU backends
         for (auto * dev : model.devices) {
package/src/llama.cpp/src/llama-model.cpp
CHANGED

@@ -86,6 +86,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_40B: return "40B";
         case LLM_TYPE_65B: return "65B";
         case LLM_TYPE_70B: return "70B";
+        case LLM_TYPE_120B: return "120B";
         case LLM_TYPE_142B: return "142B";
         case LLM_TYPE_236B: return "236B";
         case LLM_TYPE_290B: return "290B";
@@ -1834,7 +1835,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                 hparams.set_swa_pattern(2);

-
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_20B; break;
+                    case 36: type = LLM_TYPE_120B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
             } break;
         case LLM_ARCH_LFM2:
             {
@@ -6743,9 +6748,9 @@ struct llm_build_falcon : public llm_graph_context {

         ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
         ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-        ggml_tensor * Vcur =
+        ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

-        Vcur =
+        Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

         // using mode = 2 for neox mode
         Qcur = ggml_rope_ext(
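The remaining llama-model.cpp hunks below repeat one mechanical change across the builders that split Q, K and V out of a fused QKV projection: the V slice is taken as a strided 2-D view and then materialized with ggml_cont_3d in one step, rather than reshaping a view that is not contiguous. A condensed sketch of the new shape handling, with variable names as in the hunks (the exact byte offset of V inside the fused tensor varies per architecture, and the commented "before" form is only an approximation of the truncated removed lines):

// Before (approximation): the strided view was reshaped, but ggml_reshape_3d
// expects a contiguous tensor.
// ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens,
//                                   cur->nb[1], /* offset of V in QKV */ 0);
// Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

// After (as in the hunks): ggml_cont_3d copies the strided view into a fresh
// contiguous tensor that already has the target 3-D shape.
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens,
                                  cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);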
@@ -7023,9 +7028,9 @@ struct llm_build_dbrx : public llm_graph_context {

         Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
         Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-        Vcur =
+        Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

-        Vcur =
+        Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

         Qcur = ggml_rope_ext(
             ctx0, Qcur, inp_pos, nullptr,
@@ -7145,13 +7150,13 @@ struct llm_build_starcoder : public llm_graph_context {
         cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
         cb(cur, "bqkv", il);

-        ggml_tensor * Qcur =
-        ggml_tensor * Kcur =
-        ggml_tensor * Vcur =
+        ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+        ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+        ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

-        Qcur =
-        Kcur =
-        Vcur =
+        Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+        Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+        Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

         cb(Qcur, "Qcur", il);
         cb(Kcur, "Kcur", il);
@@ -7367,13 +7372,15 @@ struct llm_build_bert : public llm_graph_context {
             cb(cur, "bqkv", il);
         }

-            Qcur =
-            Kcur =
-            Vcur =
+            Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+            Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+            Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+            Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
         } else {
             Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
             Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
             Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
         }

         if (model.layers[il].attn_q_norm) {
@@ -7381,6 +7388,10 @@ struct llm_build_bert : public llm_graph_context {
                 model.layers[il].attn_q_norm,
                 model.layers[il].attn_q_norm_b,
                 LLM_NORM, il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+        } else {
+            Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
         }

         if (model.layers[il].attn_k_norm) {
@@ -7388,11 +7399,11 @@ struct llm_build_bert : public llm_graph_context {
                 model.layers[il].attn_k_norm,
                 model.layers[il].attn_k_norm_b,
                 LLM_NORM, il);
-        }

-
-
-
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+        } else {
+            Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+        }

         // RoPE
         if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
@@ -7537,9 +7548,9 @@ struct llm_build_neo_bert : public llm_graph_context {

         Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
         Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-        Vcur =
+        Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

-        Vcur =
+        Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

         // RoPE
         Qcur = ggml_rope_ext(
@@ -7646,13 +7657,13 @@ struct llm_build_bloom : public llm_graph_context {
         cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
         cb(cur, "bqkv", il);

-        ggml_tensor * Qcur =
-        ggml_tensor * Kcur =
-        ggml_tensor * Vcur =
+        ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+        ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+        ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

-        Qcur =
-        Kcur =
-        Vcur =
+        Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+        Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+        Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

         cb(Qcur, "Qcur", il);
         cb(Kcur, "Kcur", il);
@@ -7770,7 +7781,7 @@ struct llm_build_mpt : public llm_graph_context {

         ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
         ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
-        ggml_tensor * Vcur =
+        ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

         cb(Qcur, "Qcur", il);
         cb(Kcur, "Kcur", il);
@@ -7789,17 +7800,18 @@ struct llm_build_mpt : public llm_graph_context {
                 model.layers[il].attn_k_norm_b,
                 LLM_NORM, il);
             cb(Kcur, "Kcur", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
         } else {
-            Qcur =
+            Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             cb(Qcur, "Qcur", il);

-            Kcur =
+            Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
             cb(Kcur, "Kcur", il);
         }

-
-        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+        Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

         cb(Qcur, "Qcur", il);
         cb(Kcur, "Kcur", il);
@@ -8051,9 +8063,9 @@ struct llm_build_qwen : public llm_graph_context {

         ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
         ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-        ggml_tensor * Vcur =
+        ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd));

-        Vcur =
+        Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

         // using mode = 2 for neox mode
         Qcur = ggml_rope_ext(
@@ -9026,21 +9038,21 @@ struct llm_build_phi2 : public llm_graph_context {

             Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
             Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-            Vcur =
+            Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+            Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
         } else {
             Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
             Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
             Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
         }

         cb(Qcur, "Qcur", il);
         cb(Kcur, "Kcur", il);
         cb(Vcur, "Vcur", il);

-        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
         Qcur = ggml_rope_ext(
             ctx0, Qcur, inp_pos, nullptr,
             n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9164,21 +9176,21 @@ struct llm_build_phi3 : public llm_graph_context {

             Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
             Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
-            Vcur =
+            Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
+            Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
         } else {
             Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
             Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
             Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
         }

         cb(Qcur, "Qcur", il);
         cb(Kcur, "Kcur", il);
         cb(Vcur, "Vcur", il);

-        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
         Qcur = ggml_rope_ext(
             ctx0, Qcur, inp_pos, rope_factors,
             n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9428,17 +9440,17 @@ struct llm_build_gpt2 : public llm_graph_context {
         cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
         cb(cur, "bqkv", il);

-        ggml_tensor * Qcur =
-        ggml_tensor * Kcur =
-        ggml_tensor * Vcur =
+        ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+        ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+        ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

         cb(Qcur, "Qcur", il);
         cb(Kcur, "Kcur", il);
         cb(Vcur, "Vcur", il);

-        Qcur =
-        Kcur =
-        Vcur =
+        Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+        Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+        Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

         cur = build_attn(inp_attn,
             model.layers[il].wo, model.layers[il].bo,
@@ -9534,9 +9546,9 @@ struct llm_build_codeshell : public llm_graph_context {

         ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
         ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-        ggml_tensor * Vcur =
+        ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

-        Vcur =
+        Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

         Qcur = ggml_rope_ext(
             ctx0, Qcur, inp_pos, nullptr,
@@ -10864,8 +10876,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
         ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
         all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
         cb(all_coefs, "all_coefs", il);
-        all_coefs =
-        all_coefs =
+        all_coefs = ggml_transpose(ctx0, all_coefs); // [n_tokens, n_altup]
+        all_coefs = ggml_cont_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]

         innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
         ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
@@ -12278,9 +12290,9 @@ struct llm_build_gptneox : public llm_graph_context {

         ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
         ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-        ggml_tensor * Vcur =
+        ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

-        Vcur =
+        Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

         Qcur = ggml_rope_ext(
             ctx0, Qcur, inp_pos, nullptr,
@@ -13413,17 +13425,17 @@ struct llm_build_jais : public llm_graph_context {
         cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
         cb(cur, "bqkv", il);

-        ggml_tensor * Qcur =
-        ggml_tensor * Kcur =
-        ggml_tensor * Vcur =
+        ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd));
+        ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd));
+        ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));

         cb(Qcur, "Qcur", il);
         cb(Kcur, "Kcur", il);
         cb(Vcur, "Vcur", il);

-        Qcur =
-        Kcur =
-        Vcur =
+        Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+        Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+        Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

         cur = build_attn(inp_attn,
             model.layers[il].wo, model.layers[il].bo,
@@ -13526,6 +13538,7 @@ struct llm_build_chatglm : public llm_graph_context {
             }
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
         } else {
             cur = build_lora_mm(model.layers[il].wqkv, cur);
             cb(cur, "wqkv", il);
@@ -13535,11 +13548,10 @@ struct llm_build_chatglm : public llm_graph_context {
             }
             Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
             Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-            Vcur =
+            Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+            Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
         }

-        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
         //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
         Qcur = ggml_rope_ext(
             ctx0, Qcur, inp_pos, nullptr,
@@ -13660,6 +13672,7 @@ struct llm_build_glm4 : public llm_graph_context {
             }
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
         } else {
             cur = build_lora_mm(model.layers[il].wqkv, cur);
             cb(cur, "wqkv", il);
@@ -13669,11 +13682,10 @@ struct llm_build_glm4 : public llm_graph_context {
             }
             Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
             Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-            Vcur =
+            Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+            Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
         }

-        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
         Qcur = ggml_rope_ext(
             ctx0, Qcur, inp_pos, nullptr,
             n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -16840,13 +16852,13 @@ private:

         ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
         ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
-        ggml_tensor * Vcur =
+        ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv));

         cb(Qcur, "Qcur", il);
         cb(Kcur, "Kcur", il);
         cb(Vcur, "Vcur", il);

-        Vcur =
+        Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);

         Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
         cb(Qcur, "Qcur_normed", il);
@@ -16913,15 +16925,13 @@ private:
         cb(zx, "mamba_in_proj", il);
         // {8192, 5, 1, 1} -> {8192, 1, 5, 1}
         zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
-        zx =
-        zx = ggml_reshape_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
+        zx = ggml_cont_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
         cb(zx, "mamba_in_proj_out", il);

         // split into z and x
         // => {head_dim * n_heads, n_seq_tokens, n_seqs}
         ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx));
-        x =
-        x = ggml_reshape_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
+        x = ggml_cont_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
         // x = ggml_permute(ctx0, x, 0, 2, 1, 3);
         cb(x, "mamba_x_split", il);
