@fugood/llama.node 0.3.15 → 0.3.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/package.json +1 -1
  18. package/src/llama.cpp/examples/server/server.cpp +5 -0
  19. package/src/llama.cpp/examples/tts/tts.cpp +8 -0
  20. package/src/llama.cpp/ggml/src/CMakeLists.txt +5 -1
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +1493 -12
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +31 -27
  23. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +32 -12
  24. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +27 -1
  25. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
  26. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +6 -6
  27. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +46 -12
  28. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +4 -2
  29. package/src/llama.cpp/src/llama-arch.cpp +1 -0
  30. package/src/llama.cpp/src/llama-model.cpp +65 -38
  31. package/src/llama.cpp/tests/test-backend-ops.cpp +57 -14
@@ -271,19 +271,32 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
  }
  }
 
- // add extra buffer types
- auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
- auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
- ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
- if (ggml_backend_dev_get_extra_bufts_fn) {
- ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
- while (extra_bufts && *extra_bufts) {
- buft_list.emplace_back(cpu_dev, *extra_bufts);
- ++extra_bufts;
+ bool has_gpu_device = false;
+ for (auto * dev : devices) {
+ if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+ has_gpu_device = true;
+ break;
  }
  }
 
+ // add extra buffer types, only if no GPU device is present
+ // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
+ if (!has_gpu_device) {
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+ auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+ ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+ if (ggml_backend_dev_get_extra_bufts_fn) {
+ ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+ while (extra_bufts && *extra_bufts) {
+ buft_list.emplace_back(cpu_dev, *extra_bufts);
+ ++extra_bufts;
+ }
+ }
+ } else {
+ LLAMA_LOG_WARN("%s: disabling extra buffer types (i.e. repacking) since a GPU device is available\n", __func__);
+ }
+
  // add a host buffer type
  // storing the tensors in a host buffer is useful when the processing of large batches
  // is offloaded to a GPU device, since it reduces the time spent on data transfers
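The hunk above gates the CPU backend's extra buffer types (weight repacking) on the absence of a GPU device in the list. A minimal standalone sketch of that check, using only the ggml-backend API names that already appear in the diff; the helper name any_gpu_device is illustrative and not part of the package:

```cpp
// Sketch only, assuming the ggml-backend API shown in the diff above.
#include "ggml-backend.h"

#include <vector>

static bool any_gpu_device(const std::vector<ggml_backend_dev_t> & devices) {
    for (auto * dev : devices) {
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
            // a GPU is present: the change above then skips the CPU "extra"
            // buffer types, so weights are not repacked into CPU-specific layouts
            return true;
        }
    }
    return false;
}
```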
@@ -2210,9 +2223,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
  // optional bias tensors
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
 
  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
@@ -2329,7 +2342,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
 
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
  if (layer.wqkv == nullptr) {
  layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
@@ -2558,7 +2571,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
  // output
  output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
 
  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];
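The last hunk above makes the dedicated output head optional and, when it is missing, ties it to the token-embedding matrix via TENSOR_DUPLICATED. A rough sketch of what the tied head amounts to at graph-build time, assuming the {n_embd, n_vocab} shape used in the diff; the function name build_logits is illustrative:

```cpp
// Sketch only: with a tied head, `output` points at (a duplicate of) tok_embd
// and the logits are computed by the same mat-mul either way.
#include "ggml.h"

// cur is the final hidden state, shape {n_embd, n_tokens};
// output (or tok_embd) has shape {n_embd, n_vocab}.
static ggml_tensor * build_logits(ggml_context * ctx, ggml_tensor * output, ggml_tensor * cur) {
    // ggml_mul_mat contracts over dim 0 (n_embd) -> logits of shape {n_vocab, n_tokens}
    return ggml_mul_mat(ctx, output, cur);
}
```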
@@ -3215,16 +3233,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  auto & layer = layers[i];
 
  layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
 
  if (layer.wqkv == nullptr) {
  layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  }
 
  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
@@ -3335,12 +3353,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
 
  layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
- layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
  GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
 
  layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
@@ -3370,7 +3388,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
  output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
 
  const int time_mix_extra_dim = hparams.time_mix_extra_dim;
@@ -3396,7 +3414,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
 
- layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
  layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
@@ -3405,9 +3423,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  // optional bias tensors
- layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
 
  layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
 
@@ -3528,8 +3546,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
  }
 
- layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
 
  try {
  layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
@@ -3546,8 +3564,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
 
- layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
 
  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -6193,16 +6211,25 @@ struct llm_build_qwen2moe : public llm_graph_context {
  {
  // compute Q and K and RoPE them
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
 
  ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
 
  ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
 
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
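With bq/bk/bv now optional for this architecture, the graph code above guards each ggml_add. The same pattern expressed as a hypothetical helper (the diff inlines the check per projection; add_bias_if_present does not exist in the package):

```cpp
// Hypothetical helper, not part of the package: apply the bias only when the
// optional tensor was actually present in the GGUF file.
#include "ggml.h"

static ggml_tensor * add_bias_if_present(ggml_context * ctx0, ggml_tensor * cur, ggml_tensor * bias) {
    // ggml_add would dereference a NULL operand, so skip it when no bias was loaded
    return bias ? ggml_add(ctx0, cur, bias) : cur;
}
```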
@@ -1463,11 +1463,13 @@ struct test_cpy : public test_case {
  const ggml_type type_src;
  const ggml_type type_dst;
  const std::array<int64_t, 4> ne;
- const std::array<int64_t, 4> permute;
+ const std::array<int64_t, 4> permute_src;
+ const std::array<int64_t, 4> permute_dst;
  bool _src_use_permute;
+ bool _dst_use_permute;
 
  std::string vars() override {
- return VARS_TO_STR4(type_src, type_dst, ne, permute);
+ return VARS_TO_STR5(type_src, type_dst, ne, permute_src, permute_dst);
  }
 
  double max_nmse_err() override {
@@ -1480,9 +1482,11 @@ struct test_cpy : public test_case {
 
  test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
  std::array<int64_t, 4> ne = {10, 10, 10, 1},
- std::array<int64_t, 4> permute = {0, 0, 0, 0})
- : type_src(type_src), type_dst(type_dst), ne(ne), permute(permute),
- _src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
+ std::array<int64_t, 4> permute_src = {0, 0, 0, 0},
+ std::array<int64_t, 4> permute_dst = {0, 0, 0, 0})
+ : type_src(type_src), type_dst(type_dst), ne(ne), permute_src(permute_src), permute_dst(permute_dst),
+ _src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0),
+ _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0) {}
 
  ggml_tensor * build_graph(ggml_context * ctx) override {
  ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
@@ -1490,13 +1494,18 @@ struct test_cpy : public test_case {
  ggml_set_name(src, "src");
 
  if (_src_use_permute) {
- src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
+ src = ggml_permute(ctx, src, permute_src[0], permute_src[1], permute_src[2], permute_src[3]);
  ggml_set_name(src, "src_permuted");
  }
 
- ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
+ ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
  ggml_set_name(dst, "dst");
 
+ if (_dst_use_permute) {
+ dst = ggml_permute(ctx, dst, permute_dst[0], permute_dst[1], permute_dst[2], permute_dst[3]);
+ ggml_set_name(dst, "dst_permuted");
+ }
+
  ggml_tensor * out = ggml_cpy(ctx, src, dst);
  ggml_set_name(out, "out");
 
@@ -1964,9 +1973,10 @@ struct test_mul_mat : public test_case {
  const std::array<int64_t, 2> bs; // dims 3 and 4
  const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
  const std::array<int64_t, 4> per; // permutation of dimensions
+ const bool v; // whether a is a non-contiguous view
 
  std::string vars() override {
- return VARS_TO_STR8(type_a, type_b, m, n, k, bs, nr, per);
+ return VARS_TO_STR9(type_a, type_b, m, n, k, bs, nr, per, v);
  }
 
  double max_nmse_err() override {
@@ -1986,8 +1996,9 @@ struct test_mul_mat : public test_case {
  int64_t m = 32, int64_t n = 32, int64_t k = 32,
  std::array<int64_t, 2> bs = {10, 10},
  std::array<int64_t, 2> nr = {2, 2},
- std::array<int64_t, 4> per = {0, 1, 2, 3})
- : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per) {}
+ std::array<int64_t, 4> per = {0, 1, 2, 3},
+ bool v = false)
+ : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v) {}
 
  ggml_tensor * build_graph(ggml_context * ctx) override {
  // C^T = A * B^T: (k, m) * (k, n) => (m, n)
@@ -1997,6 +2008,7 @@ struct test_mul_mat : public test_case {
  const int npermuted = (per[0] != 0) + (per[1] != 1) + (per[2] != 2) + (per[3] != 3);
  if (npermuted > 0) {
  GGML_ASSERT(npermuted == 2);
+ GGML_ASSERT(!v); // not handled
  GGML_ASSERT(!ggml_is_quantized(type_a) || per[0] == 0);
  GGML_ASSERT(!ggml_is_quantized(type_b) || per[0] == 0);
 
@@ -2020,7 +2032,13 @@ struct test_mul_mat : public test_case {
  ggml_set_name(a, "a_permuted");
  ggml_set_name(b, "b_permuted");
  } else {
- a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]);
+
+ if (v) {
+ a = ggml_new_tensor_4d(ctx, type_a, k*2, m, bs[0], bs[1]);
+ a = ggml_view_4d(ctx, a, k, m, bs[0], bs[1], a->nb[1], a->nb[2], a->nb[3], 0);
+ } else {
+ a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]);
+ }
  b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
  if (!ggml_is_quantized(type_a)) {
  if (bs[1] == 1 && nr[1] == 1) {
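When the new v flag is set, the a operand above becomes a non-contiguous view into a wider allocation. A small self-contained sketch of just that construction; the type and the helper name make_noncontig_a are illustrative:

```cpp
// Sketch only: allocate A with 2*k columns, then take a k-wide view that keeps
// the parent's strides (nb[1..3]), so the view is not contiguous in memory.
#include "ggml.h"

static ggml_tensor * make_noncontig_a(ggml_context * ctx, int64_t k, int64_t m,
                                      int64_t bs0, int64_t bs1) {
    ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, k*2, m, bs0, bs1);
    return ggml_view_4d(ctx, a, k, m, bs0, bs1, a->nb[1], a->nb[2], a->nb[3], 0);
}
```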
@@ -3995,14 +4013,25 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
  test_cases.emplace_back(new test_set(GGML_TYPE_I32, GGML_TYPE_I32, {6, 5, 4, 3}, dim));
  }
 
- for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
+ // same-type copy
+ for (ggml_type type : all_types) {
+ const auto nk = ggml_blck_size(type);
+
+ for (int k = 1; k < 4; ++k) {
+ test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}));
+ test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 2, 1, 3}));
+ test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 3, 1, 2}, {0, 2, 1, 3}));
+ }
+ }
+
+ for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
  for (ggml_type type_dst : all_types) {
  test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
  test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
  }
  }
- for (ggml_type type_dst : {GGML_TYPE_F32}) {
- for (ggml_type type_src : all_types) {
+ for (ggml_type type_src : all_types) {
+ for (ggml_type type_dst : {GGML_TYPE_F32}) {
  test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
  test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
  }
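The new same-type copy cases above size dimension 0 in multiples of ggml_blck_size(type), so quantized rows stay whole blocks, and the third case exercises the new permute_src/permute_dst pair. A standalone sketch of the graph such a case builds, using F32 and illustrative sizes (the permutations match the registered case):

```cpp
// Sketch only: copy a permuted source into a permuted destination view,
// mirroring what test_cpy::build_graph does when both permutations are set.
#include "ggml.h"

static ggml_tensor * build_permuted_cpy(ggml_context * ctx) {
    int64_t ne[4] = {256, 2, 3, 4};

    ggml_tensor * src = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
    src = ggml_permute(ctx, src, 0, 3, 1, 2);   // permute_src

    ggml_tensor * dst = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, src->ne);
    dst = ggml_permute(ctx, dst, 0, 2, 1, 3);   // permute_dst

    return ggml_cpy(ctx, src, dst);             // the test's "out" tensor
}
```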
@@ -4176,6 +4205,17 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
  test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1}));
  test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
 
+ for (auto bs : {1,2,4,8}) {
+ for (auto nr : {1,4}) {
+ for (uint32_t m = 0; m < 2; ++m) {
+ for (uint32_t k = 0; k < 2; ++k) {
+ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056 + m, 1, 128 + k, {bs, 1}, {nr, 1}, {0, 2, 1, 3}));
+ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, 1}, {nr, 1}, {0, 1, 2, 3}, true));
+ }
+ }
+ }
+ }
+
  // sycl backend will limit task global_range < MAX_INT
  // test case for f16-type-convert-to-fp32 kernel with large k under fp32 compute dtype (occurs in stable-diffusion)
  // however this case needs to alloc more memory which may fail in some devices (Intel Arc770, etc.)
@@ -4444,6 +4484,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
  test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1}));
  test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32000, 512, 1, 1}));
 
+ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3}));
+ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, true));
+
  for (int bs : {1, 2, 3, 4, 5, 8, 512}) {
  for (ggml_type type_a : all_types) {
  for (ggml_type type_b : {GGML_TYPE_F32}) {