@fugood/llama.node 1.1.7 → 1.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.1.7",
+ "version": "1.1.8",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -71,19 +71,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.1.7",
- "@fugood/node-llama-linux-x64-vulkan": "1.1.7",
- "@fugood/node-llama-linux-x64-cuda": "1.1.7",
- "@fugood/node-llama-linux-arm64": "1.1.7",
- "@fugood/node-llama-linux-arm64-vulkan": "1.1.7",
- "@fugood/node-llama-linux-arm64-cuda": "1.1.7",
- "@fugood/node-llama-win32-x64": "1.1.7",
- "@fugood/node-llama-win32-x64-vulkan": "1.1.7",
- "@fugood/node-llama-win32-x64-cuda": "1.1.7",
- "@fugood/node-llama-win32-arm64": "1.1.7",
- "@fugood/node-llama-win32-arm64-vulkan": "1.1.7",
- "@fugood/node-llama-darwin-x64": "1.1.7",
- "@fugood/node-llama-darwin-arm64": "1.1.7"
+ "@fugood/node-llama-linux-x64": "1.1.8",
+ "@fugood/node-llama-linux-x64-vulkan": "1.1.8",
+ "@fugood/node-llama-linux-x64-cuda": "1.1.8",
+ "@fugood/node-llama-linux-arm64": "1.1.8",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.1.8",
+ "@fugood/node-llama-linux-arm64-cuda": "1.1.8",
+ "@fugood/node-llama-win32-x64": "1.1.8",
+ "@fugood/node-llama-win32-x64-vulkan": "1.1.8",
+ "@fugood/node-llama-win32-x64-cuda": "1.1.8",
+ "@fugood/node-llama-win32-arm64": "1.1.8",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.1.8",
+ "@fugood/node-llama-darwin-x64": "1.1.8",
+ "@fugood/node-llama-darwin-arm64": "1.1.8"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
@@ -636,6 +636,15 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
  _sess, _templates, messages, chat_template, json_schema_str, tools_str,
  parallel_tool_calls, tool_choice, enable_thinking,
  add_generation_prompt, now_str, chat_template_kwargs);
+ } catch (const nlohmann::json_abi_v3_12_0::detail::parse_error& e) {
+ Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+ return env.Undefined();
+ } catch (const std::invalid_argument& e) {
+ Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+ return env.Undefined();
+ } catch (const std::runtime_error& e) {
+ Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+ return env.Undefined();
  } catch (const std::exception &e) {
  Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
  return env.Undefined();
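Note on the hunk above: the added catch blocks surface nlohmann JSON parse errors, std::invalid_argument, and std::runtime_error raised during chat-template formatting to JavaScript as thrown errors (via ThrowAsJavaScriptException) rather than letting them escape the binding. A minimal TypeScript sketch of how a caller might handle this; `loadModel` and `getFormattedChat` are assumed API names used only for illustration and are not confirmed by this diff:

  // Hypothetical usage sketch; method names and option shape are assumptions.
  import { loadModel } from '@fugood/llama.node'

  async function formatChat(): Promise<void> {
    const context = await loadModel({ model: '/path/to/model.gguf' })
    try {
      // A malformed JSON schema or tools payload now raises a regular JS Error
      // that can be caught here, instead of escaping the native binding.
      const formatted = await context.getFormattedChat(
        [{ role: 'user', content: 'Hello' }],
      )
      console.log(formatted)
    } catch (err) {
      console.error('chat template formatting failed:', err)
    }
  }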
package/src/common.hpp CHANGED
@@ -461,7 +461,14 @@ processMediaPrompt(llama_context *ctx, mtmd_context *mtmd_ctx,
  }

  // Clear all KV cache entries after position n_past
- llama_memory_seq_rm(llama_get_memory(ctx), 0, n_past, -1);
+ auto * kv = llama_get_memory(ctx);
+ bool clear_result = llama_memory_seq_rm(kv, 0, n_past, -1);
+ if (!clear_result) {
+ fprintf(stdout, "[DEBUG] llama_memory_seq_rm failed (likely using a non-Transformer model)! Trying full clear...");
+ llama_memory_clear(kv, false);
+ n_past = 0;
+ new_n_past = n_past;
+ }

  size_t num_chunks = mtmd_input_chunks_size(chunks);

@@ -1530,6 +1530,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.ctx_shift = false;
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+ add_opt(common_arg(
+ {"--context-shift"},
+ string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+ [](common_params & params) {
+ params.ctx_shift = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
  add_opt(common_arg(
  {"--chunks"}, "N",
  string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1823,7 +1830,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params, const std::string & value) {
  params.sampling.top_n_sigma = std::stof(value);
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN}).set_sparam());
+ ).set_sparam());
  add_opt(common_arg(
  {"--xtc-probability"}, "N",
  string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
@@ -619,7 +619,6 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
  case COMMON_REASONING_FORMAT_AUTO: return "auto";
  case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
  case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
- case COMMON_REASONING_FORMAT_GRANITE: return "granite";
  default:
  throw std::runtime_error("Unknown reasoning format");
  }
@@ -239,12 +239,15 @@ struct common_params_diffusion {
  bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
  };

+ // reasoning API response format (not to be confused as chat template's reasoning format)
  enum common_reasoning_format {
  COMMON_REASONING_FORMAT_NONE,
- COMMON_REASONING_FORMAT_AUTO,
+ COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
  COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
  COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
- COMMON_REASONING_FORMAT_GRANITE, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
+ // do not extend this enum unless you absolutely have to
+ // in most cases, use COMMON_REASONING_FORMAT_AUTO
+ // see: https://github.com/ggml-org/llama.cpp/pull/15408
  };


@@ -373,7 +376,7 @@ struct common_params {
  bool cont_batching = true; // insert new sequences for decoding on-the-fly
  bool flash_attn = false; // flash attention
  bool no_perf = false; // disable performance metrics
- bool ctx_shift = true; // context shift on inifinite text generation
+ bool ctx_shift = false; // context shift on inifinite text generation
  bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
  bool kv_unified = false; // enable unified KV cache

@@ -278,6 +278,72 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
  #endif
  }

+ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+ assert(n % QK_MXFP4 == 0);
+ static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
+
+ const block_mxfp4 * GGML_RESTRICT x = vx;
+ const block_q8_0 * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_MXFP4;
+
+ int ib = 0;
+ float sumf = 0;
+
+ #if defined(__POWER9_VECTOR__)
+ const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector unsigned char vshift4 = vec_splats((unsigned char)4);
+ vector float vsumf0 = vec_splats(0.0f);
+
+ vector signed char kv = vec_xl(0, (const signed char *)kvalues_mxfp4);
+
+ #pragma GCC unroll 8
+ for (; ib < nb; ++ib) {
+ __builtin_prefetch(x[ib].qs, 0, 1);
+ __builtin_prefetch(y[ib].qs, 0, 1);
+
+ vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d) *
+ GGML_E8M0_TO_FP32_HALF(x[ib].e));
+
+ vector signed char q8y0 = vec_xl( 0, y[ib].qs);
+ vector signed char q8y1 = vec_xl(16, y[ib].qs);
+
+ vector signed char qxs = (vector signed char)vec_xl(0, x[ib].qs);
+
+ vector unsigned char lo_nibbles = (vector unsigned char)vec_and(qxs, lowMask);
+ vector unsigned char hi_nibbles = (vector unsigned char)vec_sr(qxs, vshift4);
+
+ vector signed char q4x0 = vec_perm(kv, kv, lo_nibbles);
+ vector signed char q4x1 = vec_perm(kv, kv, hi_nibbles);
+
+ vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
+ vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+
+ vector signed int vsumi0 = vec_splats((int32_t)0);
+ vsumi0 = vec_sum4s(qv0, vsumi0);
+ vsumi0 = vec_sum4s(qv1, vsumi0);
+
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vyd, vsumf0);
+ }
+
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+ sumf = vec_extract(vsumf0, 0);
+ *s = sumf;
+ #else
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(ib);
+ UNUSED(sumf);
+ ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+ #endif
+ }
+
  void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
  const int qk = QK8_0;
  const int nb = n / qk;
@@ -73,7 +73,6 @@
  #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
  #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
  #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
- #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
  // repack.cpp
  #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
  #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -145,11 +145,6 @@ llama_context::llama_context(
  __func__, n_ctx_per_seq, hparams.n_ctx_train);
  }

- if (!params.swa_full && cparams.n_seq_max > 1 && hparams.is_swa_any()) {
- LLAMA_LOG_WARN("%s: requested n_seq_max (%u) > 1, but swa_full is not enabled -- performance may be degraded: %s\n",
- __func__, cparams.n_seq_max, "https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573");
- }
-
  if (!hparams.vocab_only) {
  // GPU backends
  for (auto * dev : model.devices) {
@@ -86,6 +86,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_40B: return "40B";
  case LLM_TYPE_65B: return "65B";
  case LLM_TYPE_70B: return "70B";
+ case LLM_TYPE_120B: return "120B";
  case LLM_TYPE_142B: return "142B";
  case LLM_TYPE_236B: return "236B";
  case LLM_TYPE_290B: return "290B";
@@ -1834,7 +1835,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(2);

- // TODO: switch (hparams.n_layer)
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_20B; break;
+ case 36: type = LLM_TYPE_120B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
  } break;
  case LLM_ARCH_LFM2:
  {
@@ -6743,9 +6748,9 @@ struct llm_build_falcon : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  // using mode = 2 for neox mode
  Qcur = ggml_rope_ext(
@@ -7023,9 +7028,9 @@ struct llm_build_dbrx : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
@@ -7145,13 +7150,13 @@ struct llm_build_starcoder : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
@@ -7367,13 +7372,15 @@ struct llm_build_bert : public llm_graph_context {
  cb(cur, "bqkv", il);
  }

- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+ Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  } else {
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

  if (model.layers[il].attn_q_norm) {
@@ -7381,6 +7388,10 @@ struct llm_build_bert : public llm_graph_context {
  model.layers[il].attn_q_norm,
  model.layers[il].attn_q_norm_b,
  LLM_NORM, il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ } else {
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  }

  if (model.layers[il].attn_k_norm) {
@@ -7388,11 +7399,11 @@ struct llm_build_bert : public llm_graph_context {
  model.layers[il].attn_k_norm,
  model.layers[il].attn_k_norm_b,
  LLM_NORM, il);
- }

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ } else {
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ }

  // RoPE
  if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
@@ -7537,9 +7548,9 @@ struct llm_build_neo_bert : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  // RoPE
  Qcur = ggml_rope_ext(
@@ -7646,13 +7657,13 @@ struct llm_build_bloom : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
@@ -7770,7 +7781,7 @@ struct llm_build_mpt : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
@@ -7789,17 +7800,18 @@ struct llm_build_mpt : public llm_graph_context {
  model.layers[il].attn_k_norm_b,
  LLM_NORM, il);
  cb(Kcur, "Kcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  } else {
- Qcur = ggml_cont(ctx0, Qcur);
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  cb(Qcur, "Qcur", il);

- Kcur = ggml_cont(ctx0, Kcur);
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  cb(Kcur, "Kcur", il);
  }

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
@@ -8051,9 +8063,9 @@ struct llm_build_qwen : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd));

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  // using mode = 2 for neox mode
  Qcur = ggml_rope_ext(
@@ -9026,21 +9038,21 @@ struct llm_build_phi2 : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  } else {
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9164,21 +9176,21 @@ struct llm_build_phi3 : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  } else {
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, rope_factors,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9428,17 +9440,17 @@ struct llm_build_gpt2 : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
@@ -9534,9 +9546,9 @@ struct llm_build_codeshell : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
@@ -10864,8 +10876,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
  ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
  all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
  cb(all_coefs, "all_coefs", il);
- all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup]
- all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
+ all_coefs = ggml_transpose(ctx0, all_coefs); // [n_tokens, n_altup]
+ all_coefs = ggml_cont_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]

  innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
  ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
@@ -12278,9 +12290,9 @@ struct llm_build_gptneox : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
@@ -13413,17 +13425,17 @@ struct llm_build_jais : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd)));
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd)));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa)));
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd));
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
@@ -13526,6 +13538,7 @@ struct llm_build_chatglm : public llm_graph_context {
  }
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  } else {
  cur = build_lora_mm(model.layers[il].wqkv, cur);
  cb(cur, "wqkv", il);
@@ -13535,11 +13548,10 @@ struct llm_build_chatglm : public llm_graph_context {
  }
  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
  //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
@@ -13660,6 +13672,7 @@ struct llm_build_glm4 : public llm_graph_context {
  }
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  } else {
  cur = build_lora_mm(model.layers[il].wqkv, cur);
  cb(cur, "wqkv", il);
@@ -13669,11 +13682,10 @@ struct llm_build_glm4 : public llm_graph_context {
  }
  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -16840,13 +16852,13 @@ private:

  ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv)));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);

  Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  cb(Qcur, "Qcur_normed", il);
@@ -16913,15 +16925,13 @@ private:
  cb(zx, "mamba_in_proj", il);
  // {8192, 5, 1, 1} -> {8192, 1, 5, 1}
  zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
- zx = ggml_cont(ctx0, zx);
- zx = ggml_reshape_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
+ zx = ggml_cont_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
  cb(zx, "mamba_in_proj_out", il);

  // split into z and x
  // => {head_dim * n_heads, n_seq_tokens, n_seqs}
  ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx));
- x = ggml_cont(ctx0, x);
- x = ggml_reshape_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
+ x = ggml_cont_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
  // x = ggml_permute(ctx0, x, 0, 2, 1, 3);
  cb(x, "mamba_x_split", il);

@@ -79,6 +79,7 @@ enum llm_type {
  LLM_TYPE_40B,
  LLM_TYPE_65B,
  LLM_TYPE_70B,
+ LLM_TYPE_120B,
  LLM_TYPE_142B,
  LLM_TYPE_236B,
  LLM_TYPE_290B,