@fugood/llama.node 0.3.15 → 0.3.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/llama.cpp/examples/server/server.cpp +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +8 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +5 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +1493 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +31 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +32 -12
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +27 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +46 -12
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +4 -2
- package/src/llama.cpp/src/llama-arch.cpp +1 -0
- package/src/llama.cpp/src/llama-model.cpp +65 -38
- package/src/llama.cpp/tests/test-backend-ops.cpp +57 -14
package/src/llama.cpp/src/llama-model.cpp

@@ -271,19 +271,32 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
         }
     }
 
-    // add extra buffer types
-    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
-    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
-        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
-    if (ggml_backend_dev_get_extra_bufts_fn) {
-        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
-        while (extra_bufts && *extra_bufts) {
-            buft_list.emplace_back(cpu_dev, *extra_bufts);
-            ++extra_bufts;
+    bool has_gpu_device = false;
+    for (auto * dev : devices) {
+        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+            has_gpu_device = true;
+            break;
         }
     }
 
+    // add extra buffer types, only if no GPU device is present
+    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
+    if (!has_gpu_device) {
+        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+        if (ggml_backend_dev_get_extra_bufts_fn) {
+            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_list.emplace_back(cpu_dev, *extra_bufts);
+                ++extra_bufts;
+            }
+        }
+    } else {
+        LLAMA_LOG_WARN("%s: disabling extra buffer types (i.e. repacking) since a GPU device is available\n", __func__);
+    }
+
     // add a host buffer type
     // storing the tensors in a host buffer is useful when the processing of large batches
     // is offloaded to a GPU device, since it reduces the time spent on data transfers
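
Note: with this hunk, the CPU backend's extra buffer types (the weight-repacking path also touched by the large ggml-cpu-aarch64.cpp change listed above) are registered only when no GPU backend device is present. The following is a hedged, self-contained sketch of the same device check written against the public ggml backend registry API; it is illustrative and not part of this diff:

    #include "ggml-backend.h"
    #include <cstdio>

    int main() {
        // mirror the new has_gpu_device check, but over all registered backend devices
        bool has_gpu_device = false;
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
                has_gpu_device = true;
                break;
            }
        }
        std::printf("GPU device present: %s\n", has_gpu_device ? "yes" : "no");
        return 0;
    }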
@@ -2210,9 +2223,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
     // optional bias tensors
-    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},
-    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa},
-    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa},
+    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
 
     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 

@@ -2329,7 +2342,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
     layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
 
-    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa },
+    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
     if (layer.wqkv == nullptr) {
         layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
         layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);

@@ -2558,7 +2571,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
     // output
     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-    output = create_tensor(tn(
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
 
     for (int i = 0; i < n_layer; ++i) {
         auto & layer = layers[i];

@@ -3215,16 +3233,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     auto & layer = layers[i];
 
     layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa},
-    layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa},
+    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+    layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
 
     if (layer.wqkv == nullptr) {
         layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
         layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
         layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
-        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},
-        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa},
-        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa},
+        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
     }
 
     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

@@ -3335,12 +3353,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
 
     layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
-    layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1},
-    layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1},
-    layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1},
-    layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1},
-    layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1},
-    layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5},
+    layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+    layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+    layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+    layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+    layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+    layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
     GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
 
     layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);

@@ -3370,7 +3388,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd},
+    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
     output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
 
     const int time_mix_extra_dim = hparams.time_mix_extra_dim;

@@ -3396,7 +3414,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
     layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
 
-    layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size},
+    layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
     layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
     layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
     layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);

@@ -3405,9 +3423,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
     layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
     // optional bias tensors
-    layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size},
-    layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size},
-    layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size},
+    layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
+    layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
+    layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
 
     layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
 

@@ -3528,8 +3546,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
     }
 
-    layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate},
-    layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd},
+    layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
+    layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
 
     try {
         layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);

@@ -3546,8 +3564,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
     layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
 
-    layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd},
-    layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd},
+    layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+    layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
     layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
 
     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -6193,16 +6211,25 @@ struct llm_build_qwen2moe : public llm_graph_context {
     {
         // compute Q and K and RoPE them
         ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-        Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
         cb(Qcur, "Qcur", il);
+        if (model.layers[il].bq) {
+            Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+            cb(Qcur, "Qcur", il);
+        }
 
         ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-        Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
         cb(Kcur, "Kcur", il);
+        if (model.layers[il].bk) {
+            Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+            cb(Kcur, "Kcur", il);
+        }
 
         ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-        Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
         cb(Vcur, "Vcur", il);
+        if (model.layers[il].bv) {
+            Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+            cb(Vcur, "Vcur", il);
+        }
 
         Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
         Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
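
Note: the load_tensors hunks above and this graph hunk follow one pattern: a tensor created with the TENSOR_NOT_REQUIRED flag comes back as a null pointer when the checkpoint does not contain it, and the graph code then skips the corresponding op. (The related TENSOR_DUPLICATED flag, used in the output/token-embedding hunk above, covers the fallback where the output head reuses the token embedding weights.) A hedged, self-contained sketch of that guard, as an illustrative helper rather than code from this diff:

    #include "ggml.h"

    // apply a bias only if the optional tensor was actually loaded (non-null),
    // mirroring the if (model.layers[il].bq) / ggml_add pattern in the hunk above
    ggml_tensor * add_optional_bias(ggml_context * ctx, ggml_tensor * cur, ggml_tensor * bias) {
        return bias ? ggml_add(ctx, cur, bias) : cur;
    }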
package/src/llama.cpp/tests/test-backend-ops.cpp

@@ -1463,11 +1463,13 @@ struct test_cpy : public test_case {
     const ggml_type type_src;
     const ggml_type type_dst;
     const std::array<int64_t, 4> ne;
-    const std::array<int64_t, 4>
+    const std::array<int64_t, 4> permute_src;
+    const std::array<int64_t, 4> permute_dst;
     bool _src_use_permute;
+    bool _dst_use_permute;
 
     std::string vars() override {
-        return
+        return VARS_TO_STR5(type_src, type_dst, ne, permute_src, permute_dst);
     }
 
     double max_nmse_err() override {

@@ -1480,9 +1482,11 @@ struct test_cpy : public test_case {
 
     test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {10, 10, 10, 1},
-            std::array<int64_t, 4>
-
-
+            std::array<int64_t, 4> permute_src = {0, 0, 0, 0},
+            std::array<int64_t, 4> permute_dst = {0, 0, 0, 0})
+        : type_src(type_src), type_dst(type_dst), ne(ne), permute_src(permute_src), permute_dst(permute_dst),
+          _src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0),
+          _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());

@@ -1490,13 +1494,18 @@ struct test_cpy : public test_case {
         ggml_set_name(src, "src");
 
         if (_src_use_permute) {
-            src = ggml_permute(ctx, src,
+            src = ggml_permute(ctx, src, permute_src[0], permute_src[1], permute_src[2], permute_src[3]);
             ggml_set_name(src, "src_permuted");
         }
 
-        ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
+        ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
         ggml_set_name(dst, "dst");
 
+        if (_dst_use_permute) {
+            dst = ggml_permute(ctx, dst, permute_dst[0], permute_dst[1], permute_dst[2], permute_dst[3]);
+            ggml_set_name(dst, "dst_permuted");
+        }
+
         ggml_tensor * out = ggml_cpy(ctx, src, dst);
         ggml_set_name(out, "out");
 
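
Note: the widened test_cpy constructor (separate source and destination permutations) is exercised by the new cases added to make_test_cases_eval further down in this diff; copied from that hunk for reference:

    // same-type copies over k*nk rows, where nk = ggml_blck_size(type)
    test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}));                              // contiguous
    test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 2, 1, 3}));                // permuted src only
    test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 3, 1, 2}, {0, 2, 1, 3}));  // permuted src and dst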
@@ -1964,9 +1973,10 @@ struct test_mul_mat : public test_case {
     const std::array<int64_t, 2> bs; // dims 3 and 4
     const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
     const std::array<int64_t, 4> per; // permutation of dimensions
+    const bool v; // whether a is a non-contiguous view
 
     std::string vars() override {
-        return
+        return VARS_TO_STR9(type_a, type_b, m, n, k, bs, nr, per, v);
     }
 
     double max_nmse_err() override {

@@ -1986,8 +1996,9 @@ struct test_mul_mat : public test_case {
             int64_t m = 32, int64_t n = 32, int64_t k = 32,
             std::array<int64_t, 2> bs = {10, 10},
             std::array<int64_t, 2> nr = {2, 2},
-            std::array<int64_t, 4> per = {0, 1, 2, 3}
-
+            std::array<int64_t, 4> per = {0, 1, 2, 3},
+            bool v = false)
+        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)

@@ -1997,6 +2008,7 @@ struct test_mul_mat : public test_case {
         const int npermuted = (per[0] != 0) + (per[1] != 1) + (per[2] != 2) + (per[3] != 3);
         if (npermuted > 0) {
             GGML_ASSERT(npermuted == 2);
+            GGML_ASSERT(!v); // not handled
             GGML_ASSERT(!ggml_is_quantized(type_a) || per[0] == 0);
             GGML_ASSERT(!ggml_is_quantized(type_b) || per[0] == 0);
 

@@ -2020,7 +2032,13 @@ struct test_mul_mat : public test_case {
             ggml_set_name(a, "a_permuted");
             ggml_set_name(b, "b_permuted");
         } else {
-
+
+            if (v) {
+                a = ggml_new_tensor_4d(ctx, type_a, k*2, m, bs[0], bs[1]);
+                a = ggml_view_4d(ctx, a, k, m, bs[0], bs[1], a->nb[1], a->nb[2], a->nb[3], 0);
+            } else {
+                a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]);
+            }
             b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
             if (!ggml_is_quantized(type_a)) {
                 if (bs[1] == 1 && nr[1] == 1) {
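
Note: the new v flag is used by the performance cases added at the end of this diff; copied from the make_test_cases_perf hunk below, where the second call multiplies against a non-contiguous view of A (allocated with k*2 columns and viewed as k):

    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3}));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, true));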
@@ -3995,14 +4013,25 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         test_cases.emplace_back(new test_set(GGML_TYPE_I32, GGML_TYPE_I32, {6, 5, 4, 3}, dim));
     }
 
-
+    // same-type copy
+    for (ggml_type type : all_types) {
+        const auto nk = ggml_blck_size(type);
+
+        for (int k = 1; k < 4; ++k) {
+            test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}));
+            test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 2, 1, 3}));
+            test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 3, 1, 2}, {0, 2, 1, 3}));
+        }
+    }
+
+    for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
         for (ggml_type type_dst : all_types) {
             test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
             test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
         }
     }
-    for (ggml_type
-    for (ggml_type
+    for (ggml_type type_src : all_types) {
+        for (ggml_type type_dst : {GGML_TYPE_F32}) {
             test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
             test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
         }

@@ -4176,6 +4205,17 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
 
+    for (auto bs : {1,2,4,8}) {
+        for (auto nr : {1,4}) {
+            for (uint32_t m = 0; m < 2; ++m) {
+                for (uint32_t k = 0; k < 2; ++k) {
+                    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056 + m, 1, 128 + k, {bs, 1}, {nr, 1}, {0, 2, 1, 3}));
+                    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, 1}, {nr, 1}, {0, 1, 2, 3}, true));
+                }
+            }
+        }
+    }
+
     // sycl backend will limit task global_range < MAX_INT
     // test case for f16-type-convert-to-fp32 kernel with large k under fp32 compute dtype (occurs in stable-diffusion)
     // however this case needs to alloc more memory which may fail in some devices (Intel Arc770, etc.)

@@ -4444,6 +4484,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1}));
     test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32000, 512, 1, 1}));
 
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, true));
+
     for (int bs : {1, 2, 3, 4, 5, 8, 512}) {
         for (ggml_type type_a : all_types) {
             for (ggml_type type_b : {GGML_TYPE_F32}) {