whisper.rn 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cpp/ggml.c +5349 -5349
- package/cpp/ggml.h +810 -810
- package/cpp/whisper.cpp +518 -518
- package/cpp/whisper.h +2 -2
- package/lib/commonjs/NativeRNWhisper.js.map +1 -1
- package/lib/commonjs/index.js +3 -0
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNWhisper.js +3 -0
- package/lib/module/NativeRNWhisper.js.map +1 -1
- package/lib/module/index.js +3 -0
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNWhisper.d.ts +1 -3
- package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNWhisper.ts +2 -3
- package/src/index.ts +2 -1
- package/whisper-rn.podspec +1 -1
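Nearly every hunk in the whisper.cpp diff below follows one mechanical pattern: each ggml_/GGML_ identifier in the vendored C/C++ sources is renamed to a wsp_ggml_/WSP_GGML_ equivalent. A plausible motivation (an assumption; the diff itself states no rationale) is link-time isolation: ggml exports plain C symbols, so an app that also links another ggml-based library would otherwise hit duplicate-symbol errors. A minimal before/after sketch of the pattern:

    // Before (upstream ggml): generic C symbol names.
    struct ggml_context * ggml_init(struct ggml_init_params params);

    // After (this package): the same API under a unique prefix, so a second
    // vendored copy of ggml in the same binary cannot collide with it.
    struct wsp_ggml_context * wsp_ggml_init(struct wsp_ggml_init_params params);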
package/cpp/whisper.cpp
CHANGED
@@ -28,7 +28,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-#if defined(GGML_BIG_ENDIAN)
+#if defined(WSP_GGML_BIG_ENDIAN)
 #include <bit>

 template<typename T>
@@ -42,28 +42,28 @@ float byteswap(float value) {
 }

 template<typename T>
-static void byteswap_tensor_data(ggml_tensor * tensor) {
+static void byteswap_tensor_data(wsp_ggml_tensor * tensor) {
     T * datum = reinterpret_cast<T *>(tensor->data);
-    for (int i = 0; i < ggml_nelements(tensor); i++) {
+    for (int i = 0; i < wsp_ggml_nelements(tensor); i++) {
         datum[i] = byteswap(datum[i]);
     }
 }

-static void byteswap_tensor(ggml_tensor * tensor) {
+static void byteswap_tensor(wsp_ggml_tensor * tensor) {
     switch (tensor->type) {
-        case GGML_TYPE_I16: {
+        case WSP_GGML_TYPE_I16: {
             byteswap_tensor_data<int16_t>(tensor);
             break;
         }
-        case GGML_TYPE_F16: {
-            byteswap_tensor_data<ggml_fp16_t>(tensor);
+        case WSP_GGML_TYPE_F16: {
+            byteswap_tensor_data<wsp_ggml_fp16_t>(tensor);
             break;
         }
-        case GGML_TYPE_I32: {
+        case WSP_GGML_TYPE_I32: {
             byteswap_tensor_data<int32_t>(tensor);
             break;
         }
-        case GGML_TYPE_F32: {
+        case WSP_GGML_TYPE_F32: {
             byteswap_tensor_data<float>(tensor);
             break;
         }
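These helpers fix up tensor byte order when whisper.cpp is built for a big-endian target (the hunk header shows they sit beside a float byteswap(float) overload, and <bit> is included above): ggml model files store little-endian data, so every element must be swapped after reading. A minimal sketch of such overloads, assuming C++23's std::byteswap (the package's actual helpers may differ):

    #include <bit>      // std::bit_cast; std::byteswap is C++23
    #include <cstdint>

    // Integral values: reverse the byte order directly.
    template<typename T>
    static T byteswap(T value) {
        return std::byteswap(value);
    }

    // Floats: reinterpret as a 32-bit integer, swap, reinterpret back.
    static float byteswap(float value) {
        return std::bit_cast<float>(std::byteswap(std::bit_cast<std::uint32_t>(value)));
    }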
@@ -263,8 +263,8 @@ static const std::map<e_model, size_t> MEM_REQ_SCRATCH3 = {
     { MODEL_LARGE, 9ull*MB },
 };

-static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
-    { GGML_TYPE_F32,
+static const std::map<wsp_ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
+    { WSP_GGML_TYPE_F32,
         {
             { MODEL_TINY, 74ull*MB },
             { MODEL_BASE, 142ull*MB },
@@ -273,7 +273,7 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
             { MODEL_LARGE, 2952ull*MB },
         },
     },
-    { GGML_TYPE_F16,
+    { WSP_GGML_TYPE_F16,
         {
             { MODEL_TINY, 74ull*MB },
             { MODEL_BASE, 142ull*MB },
@@ -282,7 +282,7 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
             { MODEL_LARGE, 2952ull*MB },
         },
     },
-    { GGML_TYPE_Q4_0,
+    { WSP_GGML_TYPE_Q4_0,
         {
             { MODEL_TINY, 26ull*MB },
             { MODEL_BASE, 50ull*MB },
@@ -291,7 +291,7 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
             { MODEL_LARGE, 940ull*MB },
         },
     },
-    { GGML_TYPE_Q4_1,
+    { WSP_GGML_TYPE_Q4_1,
         {
             { MODEL_TINY, 32ull*MB },
             { MODEL_BASE, 58ull*MB },
@@ -300,7 +300,7 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
             { MODEL_LARGE, 1124ull*MB },
         },
     },
-    { GGML_TYPE_Q5_0,
+    { WSP_GGML_TYPE_Q5_0,
         {
             { MODEL_TINY, 30ull*MB },
             { MODEL_BASE, 54ull*MB },
@@ -309,7 +309,7 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
             { MODEL_LARGE, 1034ull*MB },
         },
     },
-    { GGML_TYPE_Q5_1,
+    { WSP_GGML_TYPE_Q5_1,
         {
             { MODEL_TINY, 32ull*MB },
             { MODEL_BASE, 58ull*MB },
@@ -318,7 +318,7 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
             { MODEL_LARGE, 1124ull*MB },
         },
     },
-    { GGML_TYPE_Q8_0,
+    { WSP_GGML_TYPE_Q8_0,
         {
             { MODEL_TINY, 45ull*MB },
             { MODEL_BASE, 84ull*MB },
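The renamed MEM_REQ_MODEL table is keyed first by weight type and then by model size, so callers can size the model buffer with a nested lookup. A hedged usage sketch (the actual call sites fall outside this diff):

    // e.g. a base model with Q5_0 weights reserves roughly 54 MB of tensor storage:
    const size_t mem_required_model =
        MEM_REQ_MODEL.at(WSP_GGML_TYPE_Q5_0).at(MODEL_BASE); // 54ull*MB per the table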
@@ -446,95 +446,95 @@ struct whisper_hparams {
 // audio encoding layer
 struct whisper_layer_encoder {
     // encoder.blocks.*.attn_ln
-    struct ggml_tensor * attn_ln_0_w;
-    struct ggml_tensor * attn_ln_0_b;
+    struct wsp_ggml_tensor * attn_ln_0_w;
+    struct wsp_ggml_tensor * attn_ln_0_b;

     // encoder.blocks.*.attn.out
-    struct ggml_tensor * attn_ln_1_w;
-    struct ggml_tensor * attn_ln_1_b;
+    struct wsp_ggml_tensor * attn_ln_1_w;
+    struct wsp_ggml_tensor * attn_ln_1_b;

     // encoder.blocks.*.attn.query
-    struct ggml_tensor * attn_q_w;
-    struct ggml_tensor * attn_q_b;
+    struct wsp_ggml_tensor * attn_q_w;
+    struct wsp_ggml_tensor * attn_q_b;

     // encoder.blocks.*.attn.key
-    struct ggml_tensor * attn_k_w;
+    struct wsp_ggml_tensor * attn_k_w;

     // encoder.blocks.*.attn.value
-    struct ggml_tensor * attn_v_w;
-    struct ggml_tensor * attn_v_b;
+    struct wsp_ggml_tensor * attn_v_w;
+    struct wsp_ggml_tensor * attn_v_b;

     // encoder.blocks.*.mlp_ln
-    struct ggml_tensor * mlp_ln_w;
-    struct ggml_tensor * mlp_ln_b;
+    struct wsp_ggml_tensor * mlp_ln_w;
+    struct wsp_ggml_tensor * mlp_ln_b;

     // encoder.blocks.*.mlp.0
-    struct ggml_tensor * mlp_0_w;
-    struct ggml_tensor * mlp_0_b;
+    struct wsp_ggml_tensor * mlp_0_w;
+    struct wsp_ggml_tensor * mlp_0_b;

     // encoder.blocks.*.mlp.2
-    struct ggml_tensor * mlp_1_w;
-    struct ggml_tensor * mlp_1_b;
+    struct wsp_ggml_tensor * mlp_1_w;
+    struct wsp_ggml_tensor * mlp_1_b;
 };

 // token decoding layer
 struct whisper_layer_decoder {
     // decoder.blocks.*.attn_ln
-    struct ggml_tensor * attn_ln_0_w;
-    struct ggml_tensor * attn_ln_0_b;
+    struct wsp_ggml_tensor * attn_ln_0_w;
+    struct wsp_ggml_tensor * attn_ln_0_b;

     // decoder.blocks.*.attn.out
-    struct ggml_tensor * attn_ln_1_w;
-    struct ggml_tensor * attn_ln_1_b;
+    struct wsp_ggml_tensor * attn_ln_1_w;
+    struct wsp_ggml_tensor * attn_ln_1_b;

     // decoder.blocks.*.attn.query
-    struct ggml_tensor * attn_q_w;
-    struct ggml_tensor * attn_q_b;
+    struct wsp_ggml_tensor * attn_q_w;
+    struct wsp_ggml_tensor * attn_q_b;

     // decoder.blocks.*.attn.key
-    struct ggml_tensor * attn_k_w;
+    struct wsp_ggml_tensor * attn_k_w;

     // decoder.blocks.*.attn.value
-    struct ggml_tensor * attn_v_w;
-    struct ggml_tensor * attn_v_b;
+    struct wsp_ggml_tensor * attn_v_w;
+    struct wsp_ggml_tensor * attn_v_b;

     // decoder.blocks.*.cross_attn_ln
-    struct ggml_tensor * cross_attn_ln_0_w;
-    struct ggml_tensor * cross_attn_ln_0_b;
+    struct wsp_ggml_tensor * cross_attn_ln_0_w;
+    struct wsp_ggml_tensor * cross_attn_ln_0_b;

     // decoder.blocks.*.cross_attn.out
-    struct ggml_tensor * cross_attn_ln_1_w;
-    struct ggml_tensor * cross_attn_ln_1_b;
+    struct wsp_ggml_tensor * cross_attn_ln_1_w;
+    struct wsp_ggml_tensor * cross_attn_ln_1_b;

     // decoder.blocks.*.cross_attn.query
-    struct ggml_tensor * cross_attn_q_w;
-    struct ggml_tensor * cross_attn_q_b;
+    struct wsp_ggml_tensor * cross_attn_q_w;
+    struct wsp_ggml_tensor * cross_attn_q_b;

     // decoder.blocks.*.cross_attn.key
-    struct ggml_tensor * cross_attn_k_w;
+    struct wsp_ggml_tensor * cross_attn_k_w;

     // decoder.blocks.*.cross_attn.value
-    struct ggml_tensor * cross_attn_v_w;
-    struct ggml_tensor * cross_attn_v_b;
+    struct wsp_ggml_tensor * cross_attn_v_w;
+    struct wsp_ggml_tensor * cross_attn_v_b;

     // decoder.blocks.*.mlp_ln
-    struct ggml_tensor * mlp_ln_w;
-    struct ggml_tensor * mlp_ln_b;
+    struct wsp_ggml_tensor * mlp_ln_w;
+    struct wsp_ggml_tensor * mlp_ln_b;

     // decoder.blocks.*.mlp.0
-    struct ggml_tensor * mlp_0_w;
-    struct ggml_tensor * mlp_0_b;
+    struct wsp_ggml_tensor * mlp_0_w;
+    struct wsp_ggml_tensor * mlp_0_b;

     // decoder.blocks.*.mlp.2
-    struct ggml_tensor * mlp_1_w;
-    struct ggml_tensor * mlp_1_b;
+    struct wsp_ggml_tensor * mlp_1_w;
+    struct wsp_ggml_tensor * mlp_1_b;
 };

 struct whisper_kv_cache {
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
+    struct wsp_ggml_tensor * k;
+    struct wsp_ggml_tensor * v;

-    struct ggml_context * ctx;
+    struct wsp_ggml_context * ctx;

     std::vector<uint8_t> buf;

@@ -548,42 +548,42 @@ struct whisper_model {
     whisper_filters filters;

     // encoder.positional_embedding
-    struct ggml_tensor * e_pe;
+    struct wsp_ggml_tensor * e_pe;

     // encoder.conv1
-    struct ggml_tensor * e_conv_1_w;
-    struct ggml_tensor * e_conv_1_b;
+    struct wsp_ggml_tensor * e_conv_1_w;
+    struct wsp_ggml_tensor * e_conv_1_b;

     // encoder.conv2
-    struct ggml_tensor * e_conv_2_w;
-    struct ggml_tensor * e_conv_2_b;
+    struct wsp_ggml_tensor * e_conv_2_w;
+    struct wsp_ggml_tensor * e_conv_2_b;

     // encoder.ln_post
-    struct ggml_tensor * e_ln_w;
-    struct ggml_tensor * e_ln_b;
+    struct wsp_ggml_tensor * e_ln_w;
+    struct wsp_ggml_tensor * e_ln_b;

     // decoder.positional_embedding
-    struct ggml_tensor * d_pe;
+    struct wsp_ggml_tensor * d_pe;

     // decoder.token_embedding
-    struct ggml_tensor * d_te;
+    struct wsp_ggml_tensor * d_te;

     // decoder.ln
-    struct ggml_tensor * d_ln_w;
-    struct ggml_tensor * d_ln_b;
+    struct wsp_ggml_tensor * d_ln_w;
+    struct wsp_ggml_tensor * d_ln_b;

     std::vector<whisper_layer_encoder> layers_encoder;
     std::vector<whisper_layer_decoder> layers_decoder;

     // context
-    struct ggml_context * ctx;
+    struct wsp_ggml_context * ctx;

     // the model memory buffer is read-only and can be shared between processors
     std::vector<uint8_t> * buf;

     // tensors
     int n_loaded;
-    std::map<std::string, struct ggml_tensor *> tensors;
+    std::map<std::string, struct wsp_ggml_tensor *> tensors;
 };

 struct whisper_sequence {
@@ -678,15 +678,15 @@ struct whisper_state {
     // [EXPERIMENTAL] speed-up techniques
     int32_t exp_n_audio_ctx = 0; // 0 - use default

-    void use_buf(struct ggml_context * ctx, int i) {
+    void use_buf(struct wsp_ggml_context * ctx, int i) {
 #if defined(WHISPER_USE_SCRATCH)
         size_t last_size = 0;

         if (i == -1) {
-            last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
+            last_size = wsp_ggml_set_scratch(ctx, { 0, 0, nullptr, });
         } else {
             auto & buf = buf_scratch[i];
-            last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
+            last_size = wsp_ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
         }

         if (buf_last >= 0) {
@@ -714,8 +714,8 @@ struct whisper_context {
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;

-    ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX)
-    ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16)
+    wsp_ggml_type wtype = wsp_ggml_type::WSP_GGML_TYPE_F16; // weight type (FP32 / FP16 / QX)
+    wsp_ggml_type itype = wsp_ggml_type::WSP_GGML_TYPE_F16; // intermediate type (FP32 or FP16)

     whisper_model model;
     whisper_vocab vocab;
@@ -749,17 +749,17 @@ static bool kv_cache_init(
         const struct whisper_hparams & hparams,
         const size_t mem_bytes,
         struct whisper_kv_cache & cache,
-        ggml_type wtype,
+        wsp_ggml_type wtype,
         int n_ctx) {
     cache.buf.resize(mem_bytes);

-    struct ggml_init_params params = {
+    struct wsp_ggml_init_params params = {
         /*.mem_size =*/ cache.buf.size(),
         /*.mem_buffer =*/ cache.buf.data(),
         /*.no_alloc =*/ false,
     };

-    cache.ctx = ggml_init(params);
+    cache.ctx = wsp_ggml_init(params);

     if (!cache.ctx) {
         log("%s: failed to allocate memory for kv cache\n", __func__);
@@ -772,8 +772,8 @@ static bool kv_cache_init(
     const int n_mem = n_text_layer*n_ctx;
     const int n_elements = n_text_state*n_mem;

-    cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-    cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    cache.k = wsp_ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    cache.v = wsp_ggml_new_tensor_1d(cache.ctx, wtype, n_elements);

     return true;
 }
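Both cache tensors are allocated as flat 1-D arrays of n_text_state*n_text_layer*n_ctx elements. A back-of-the-envelope sketch with assumed hyperparameters (Whisper base: n_text_state = 512, n_text_layer = 6, n_ctx = 448):

    const int    n_mem      = 6 * 448;               // n_text_layer*n_ctx = 2688
    const int    n_elements = 512 * n_mem;           // per tensor: 1,376,256
    const size_t kv_bytes   = 2ull * n_elements * 2; // K + V at F16 (2 B): ~5.5 MB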
@@ -781,36 +781,36 @@ static bool kv_cache_init(
 static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
     WHISPER_ASSERT(cache.ctx);

-    const int n_elements = ggml_nelements(cache.k);
-    WHISPER_ASSERT(n_elements == ggml_nelements(cache.v));
+    const int n_elements = wsp_ggml_nelements(cache.k);
+    WHISPER_ASSERT(n_elements == wsp_ggml_nelements(cache.v));

-    const ggml_type wtype = cache.k->type;
+    const wsp_ggml_type wtype = cache.k->type;
     WHISPER_ASSERT(wtype == cache.v->type);

-    WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_sizef(wtype));
+    WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*wsp_ggml_type_sizef(wtype));

-    struct ggml_init_params params = {
+    struct wsp_ggml_init_params params = {
         /*.mem_size =*/ cache.buf.size(),
         /*.mem_buffer =*/ cache.buf.data(),
         /*.no_alloc =*/ false,
     };

-    cache.ctx = ggml_init(params);
+    cache.ctx = wsp_ggml_init(params);

     if (!cache.ctx) {
         log("%s: failed to allocate memory for kv cache\n", __func__);
         return false;
     }

-    cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-    cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    cache.k = wsp_ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    cache.v = wsp_ggml_new_tensor_1d(cache.ctx, wtype, n_elements);

     return true;
 }

 static void kv_cache_free(struct whisper_kv_cache & cache) {
     if (cache.ctx) {
-        ggml_free(cache.ctx);
+        wsp_ggml_free(cache.ctx);
         cache.ctx = nullptr;
     }
 }
@@ -829,7 +829,7 @@ static void kv_cache_free(struct whisper_kv_cache & cache) {
 static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
     log("%s: loading model\n", __func__);

-    const int64_t t_start_us = ggml_time_us();
+    const int64_t t_start_us = wsp_ggml_time_us();

     wctx.t_start_us = t_start_us;

@@ -840,7 +840,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
     {
         uint32_t magic;
         read_safe(loader, magic);
-        if (magic != GGML_FILE_MAGIC) {
+        if (magic != WSP_GGML_FILE_MAGIC) {
             log("%s: invalid model data (bad magic)\n", __func__);
             return false;
         }
@@ -884,14 +884,14 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
             model.type = e_model::MODEL_LARGE;
         }

-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        const int32_t qntvr = hparams.ftype / WSP_GGML_QNT_VERSION_FACTOR;

-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+        hparams.ftype %= WSP_GGML_QNT_VERSION_FACTOR;

         // for the big tensors, we have the option to store the data in 16-bit floats or quantized
         // in order to save memory and also to speed up the computation
-        wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-        if (wctx.wtype == GGML_TYPE_COUNT) {
+        wctx.wtype = wsp_ggml_ftype_to_wsp_ggml_type((wsp_ggml_ftype) (model.hparams.ftype));
+        if (wctx.wtype == WSP_GGML_TYPE_COUNT) {
             log("%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
             return false;
         }
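The two renamed lines above decode ggml's packed ftype field: the integer quotient is the quantization format version and the remainder is the actual tensor-type selector. A worked example, assuming ggml's conventional factor of 1000:

    const int32_t factor = 1000;          // conventional WSP_GGML_QNT_VERSION_FACTOR
    int32_t ftype = 2002;                 // hypothetical value read from the model header
    const int32_t qntvr = ftype / factor; // 2 -> quantization format version 2
    ftype %= factor;                      // 2 -> e.g. "mostly Q4_0" in ggml's ftype enum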
@@ -1033,8 +1033,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {

     size_t ctx_size = 0;

-    const ggml_type wtype = wctx.wtype;
-    const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type
+    const wsp_ggml_type wtype = wctx.wtype;
+    const wsp_ggml_type vtype = wctx.wtype == WSP_GGML_TYPE_F32 ? WSP_GGML_TYPE_F32 : WSP_GGML_TYPE_F16; // conv type

     {
         const auto & hparams = model.hparams;
@@ -1053,92 +1053,92 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {

         // encoder
         {
-            ctx_size += n_audio_ctx*n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_pe;
+            ctx_size += n_audio_ctx*n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_pe;

-            ctx_size += 3*n_mels*n_audio_state*ggml_type_sizef(vtype); // e_conv_1_w
-            ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_1_b
+            ctx_size += 3*n_mels*n_audio_state*wsp_ggml_type_sizef(vtype); // e_conv_1_w
+            ctx_size += n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_conv_1_b

-            ctx_size += 3*n_audio_state*n_audio_state*ggml_type_sizef(vtype); // e_conv_2_w
-            ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_2_b
+            ctx_size += 3*n_audio_state*n_audio_state*wsp_ggml_type_sizef(vtype); // e_conv_2_w
+            ctx_size += n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_conv_2_b

-            ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_w;
-            ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_b;
+            ctx_size += n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_ln_w;
+            ctx_size += n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_ln_b;
         }

         // decoder
         {
-            ctx_size += n_text_ctx*n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_pe;
+            ctx_size += n_text_ctx*n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // d_pe;

-            ctx_size += n_vocab*n_text_state*ggml_type_sizef(wtype); // d_te;
+            ctx_size += n_vocab*n_text_state*wsp_ggml_type_sizef(wtype); // d_te;

-            ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_w;
-            ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_b;
+            ctx_size += n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // d_ln_w;
+            ctx_size += n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // d_ln_b;
         }

         // encoder layers
         {
-            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
-            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b
+            ctx_size += n_audio_layer*(n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_ln_w
+            ctx_size += n_audio_layer*(n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_ln_b

-            ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_0_w
-            ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b
+            ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // mlp_0_w
+            ctx_size += n_audio_layer*( 4*n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_0_b

-            ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_1_w
-            ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b
+            ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // mlp_1_w
+            ctx_size += n_audio_layer*( n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_1_b

-            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
-            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b
+            ctx_size += n_audio_layer*(n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_0_w
+            ctx_size += n_audio_layer*(n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_0_b

-            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_q_w
-            ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b
+            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // attn_q_w
+            ctx_size += n_audio_layer*( n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_q_b

-            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_k_w
+            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // attn_k_w

-            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_v_w
-            ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b
+            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // attn_v_w
+            ctx_size += n_audio_layer*( n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_v_b

-            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_ln_1_w
-            ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
+            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // attn_ln_1_w
+            ctx_size += n_audio_layer*( n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_1_b
         }

         // decoder layers
         {
-            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
-            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b
+            ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_ln_w
+            ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_ln_b

-            ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_0_w
-            ctx_size += n_text_layer*( 4*n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b
+            ctx_size += n_text_layer*(4*n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // mlp_0_w
+            ctx_size += n_text_layer*( 4*n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_0_b

-            ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_1_w
-            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b
+            ctx_size += n_text_layer*(4*n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // mlp_1_w
+            ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_1_b

-            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
-            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b
+            ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_0_w
+            ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_0_b

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_q_w
-            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b
+            ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // attn_q_w
+            ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_q_b

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_k_w
+            ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // attn_k_w

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_v_w
-            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b
+            ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // attn_v_w
+            ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_v_b

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_ln_1_w
-            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
+            ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // attn_ln_1_w
+            ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_1_b
             //
-            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_w
-            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_b
+            ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_ln_0_w
+            ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_ln_0_b

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_q_w
-            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_q_b
+            ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // cross_attn_q_w
+            ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_q_b

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_k_w
+            ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // cross_attn_k_w

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_v_w
-            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_v_b
+            ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // cross_attn_v_w
+            ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_v_b

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_ln_1_w
-            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
+            ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // cross_attn_ln_1_w
+            ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_ln_1_b
         }

         ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead
@@ -1148,15 +1148,15 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {

     // create the ggml context
     {
-        struct ggml_init_params params = {
+        struct wsp_ggml_init_params params = {
             /*.mem_size =*/ wctx.model.buf->size(),
             /*.mem_buffer =*/ wctx.model.buf->data(),
             /*.no_alloc =*/ false,
         };

-        model.ctx = ggml_init(params);
+        model.ctx = wsp_ggml_init(params);
         if (!model.ctx) {
-            log("%s: ggml_init() failed\n", __func__);
+            log("%s: wsp_ggml_init() failed\n", __func__);
             return false;
         }
     }
@@ -1184,16 +1184,16 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {

         // encoder
         {
-            model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx);
+            model.e_pe = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, n_audio_state, n_audio_ctx);

-            model.e_conv_1_w = ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state);
-            model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
+            model.e_conv_1_w = wsp_ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state);
+            model.e_conv_1_b = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, 1, n_audio_state);

-            model.e_conv_2_w = ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state);
-            model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
+            model.e_conv_2_w = wsp_ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state);
+            model.e_conv_2_b = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, 1, n_audio_state);

-            model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
-            model.e_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+            model.e_ln_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);
+            model.e_ln_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);

             // map by name
             model.tensors["encoder.positional_embedding"] = model.e_pe;
@@ -1210,28 +1210,28 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
             for (int i = 0; i < n_audio_layer; ++i) {
                 auto & layer = model.layers_encoder[i];

-                layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
-                layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+                layer.mlp_ln_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);
+                layer.mlp_ln_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);

-                layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state);
-                layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state);
+                layer.mlp_0_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state);
+                layer.mlp_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 4*n_audio_state);

-                layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state);
-                layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+                layer.mlp_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state);
+                layer.mlp_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);

-                layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
-                layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+                layer.attn_ln_0_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);
+                layer.attn_ln_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);

-                layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
-                layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+                layer.attn_q_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+                layer.attn_q_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);

-                layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+                layer.attn_k_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);

-                layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
-                layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+                layer.attn_v_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+                layer.attn_v_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);

-                layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
-                layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+                layer.attn_ln_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+                layer.attn_ln_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);

                 // map by name
                 model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
@@ -1261,12 +1261,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {

         // decoder
         {
-            model.d_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx);
+            model.d_pe = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, n_text_state, n_text_ctx);

-            model.d_te = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab);
+            model.d_te = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab);

-            model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
-            model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+            model.d_ln_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
+            model.d_ln_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

             // map by name
             model.tensors["decoder.positional_embedding"] = model.d_pe;
@@ -1279,42 +1279,42 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
             for (int i = 0; i < n_text_layer; ++i) {
                 auto & layer = model.layers_decoder[i];

-                layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
-                layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.mlp_ln_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
+                layer.mlp_ln_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

-                layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state);
-                layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state);
+                layer.mlp_0_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state);
+                layer.mlp_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 4*n_text_state);

-                layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state);
-                layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.mlp_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state);
+                layer.mlp_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

-                layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
-                layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.attn_ln_0_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
+                layer.attn_ln_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

-                layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
-                layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.attn_q_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+                layer.attn_q_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

-                layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+                layer.attn_k_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);

-                layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
-                layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.attn_v_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+                layer.attn_v_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

-                layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
-                layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.attn_ln_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+                layer.attn_ln_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

-                layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
-                layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.cross_attn_ln_0_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
+                layer.cross_attn_ln_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

-                layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
-                layer.cross_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.cross_attn_q_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+                layer.cross_attn_q_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

-                layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+                layer.cross_attn_k_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);

-                layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
-                layer.cross_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.cross_attn_v_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+                layer.cross_attn_v_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

-                layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
-                layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.cross_attn_ln_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+                layer.cross_attn_ln_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

                 // map by name
                 model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
@@ -1394,7 +1394,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
             }

             auto tensor = model.tensors[name.data()];
-            if (ggml_nelements(tensor) != nelements) {
+            if (wsp_ggml_nelements(tensor) != nelements) {
                 log("%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
                 log("%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n",
                     __func__, ne[0], ne[1], ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]);
@@ -1407,19 +1407,19 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
                 return false;
             }

-            const size_t bpe = ggml_type_size(ggml_type(ttype));
+            const size_t bpe = wsp_ggml_type_size(wsp_ggml_type(ttype));

-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
+            if ((nelements*bpe)/wsp_ggml_blck_size(tensor->type) != wsp_ggml_nbytes(tensor)) {
                 log("%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                    __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
+                    __func__, name.data(), wsp_ggml_nbytes(tensor), nelements*bpe);
                 return false;
             }

-            loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
+            loader->read(loader->context, tensor->data, wsp_ggml_nbytes(tensor));
             BYTESWAP_TENSOR(tensor);

-            //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1024.0/1024.0);
-            total_size += ggml_nbytes(tensor);
+            //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], wsp_ggml_type_name((wsp_ggml_type) ttype), wsp_ggml_nbytes(tensor)/1024.0/1024.0);
+            total_size += wsp_ggml_nbytes(tensor);
             model.n_loaded++;
         }

@@ -1433,7 +1433,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
         }
     }

-    wctx.t_load_us = ggml_time_us() - t_start_us;
+    wctx.t_load_us = wsp_ggml_time_us() - t_start_us;

     return true;
 }
@@ -1454,7 +1454,7 @@ static bool whisper_encode_internal(
         const int mel_offset,
         const int n_threads){

-    const int64_t t_start_us = ggml_time_us();
+    const int64_t t_start_us = wsp_ggml_time_us();

     const auto & model = wctx.model;
     const auto & mel_inp = wstate.mel;
@@ -1468,21 +1468,21 @@ static bool whisper_encode_internal(
     const int n_mels = hparams.n_mels;
     assert(mel_inp.n_mel == n_mels);

-    struct ggml_init_params params = {
+    struct wsp_ggml_init_params params = {
         /*.mem_size =*/ wstate.buf_compute.size(),
         /*.mem_buffer =*/ wstate.buf_compute.data(),
         /*.no_alloc =*/ false,
     };

-    struct ggml_context * ctx0 = ggml_init(params);
+    struct wsp_ggml_context * ctx0 = wsp_ggml_init(params);

     wstate.use_buf(ctx0, 0);

-    struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
-    assert(mel->type == GGML_TYPE_F32);
+    struct wsp_ggml_tensor * mel = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, 2*n_ctx, n_mels);
+    assert(mel->type == WSP_GGML_TYPE_F32);
     {
         float * dst = (float *) mel->data;
-        memset(dst, 0, ggml_nbytes(mel));
+        memset(dst, 0, wsp_ggml_nbytes(mel));

         const int i0 = std::min(mel_offset, mel_inp.n_len);
         const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
@@ -1494,7 +1494,7 @@ static bool whisper_encode_internal(
         }
     }

-    struct ggml_tensor * cur;
+    struct wsp_ggml_tensor * cur;

 #ifndef WHISPER_USE_COREML
     const bool use_coreml = false;
@@ -1513,25 +1513,25 @@ static bool whisper_encode_internal(
         {
             wstate.use_buf(ctx0, 1);

-            cur = ggml_conv_1d_ph(ctx0, model.e_conv_1_w, mel, 1, 1);
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
+            cur = wsp_ggml_conv_1d_ph(ctx0, model.e_conv_1_w, mel, 1, 1);
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0,
                         model.e_conv_1_b,
                         cur),
                     cur);

-            cur = ggml_gelu(ctx0, cur);
+            cur = wsp_ggml_gelu(ctx0, cur);

             wstate.use_buf(ctx0, 0);

-            cur = ggml_conv_1d_ph(ctx0, model.e_conv_2_w, cur, 2, 1);
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
+            cur = wsp_ggml_conv_1d_ph(ctx0, model.e_conv_2_w, cur, 2, 1);
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0,
                         model.e_conv_2_b,
                         cur),
                     cur);

-            cur = ggml_gelu(ctx0, cur);
+            cur = wsp_ggml_gelu(ctx0, cur);
         }

         wstate.use_buf(ctx0, 3);
@@ -1544,25 +1544,25 @@ static bool whisper_encode_internal(
     //iter = (iter + 1) % n_iter;

     //if (iter == 0) {
-    //  memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k));
-    //  memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v));
+    //  memset(model.memory_cross_k->data, 0, wsp_ggml_nbytes(model.memory_cross_k));
+    //  memset(model.memory_cross_v->data, 0, wsp_ggml_nbytes(model.memory_cross_v));
     //}

     static int iter = 0;

-    const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
-    const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
+    const size_t e_pe_stride = model.e_pe->ne[0]*wsp_ggml_element_size(model.e_pe);
+    const size_t e_pe_offset = model.e_pe->ne[0]*wsp_ggml_element_size(model.e_pe)*n_ctx*iter;

-    struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
+    struct wsp_ggml_tensor * e_pe = wsp_ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);

-    cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
+    cur = wsp_ggml_add(ctx0, e_pe, wsp_ggml_transpose(ctx0, cur));

     // ===================================================================

     // original:
-    //cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
+    //cur = wsp_ggml_add(ctx0, model.e_pe, wsp_ggml_transpose(ctx0, cur));

-    struct ggml_tensor * inpL = cur;
+    struct wsp_ggml_tensor * inpL = cur;

     for (int il = 0; il < n_layer; ++il) {
         const auto & layer = model.layers_encoder[il];
@@ -1571,45 +1571,45 @@ static bool whisper_encode_internal(
        {
             wstate.use_buf(ctx0, 0);

-            cur = ggml_norm(ctx0, inpL);
+            cur = wsp_ggml_norm(ctx0, inpL);

             // cur = ln_0_w*cur + ln_0_b
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_mul(ctx0,
+                        wsp_ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
                         cur),
-                    ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
+                    wsp_ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
        }

         // self-attention
         {
             wstate.use_buf(ctx0, 1);

-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
+            struct wsp_ggml_tensor * Qcur = wsp_ggml_mul_mat(ctx0,
                     layer.attn_q_w,
                     cur);

-            Qcur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
+            Qcur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0,
                         layer.attn_q_b,
                         Qcur),
                     Qcur);

-            //Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            //Qcur = wsp_ggml_scale_inplace(ctx0, Qcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

             // note: no bias for Key
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
+            struct wsp_ggml_tensor * Kcur = wsp_ggml_mul_mat(ctx0,
                     layer.attn_k_w,
                     cur);

-            //Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            //Kcur = wsp_ggml_scale_inplace(ctx0, Kcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
+            struct wsp_ggml_tensor * Vcur = wsp_ggml_mul_mat(ctx0,
                     layer.attn_v_w,
                     cur);

-            Vcur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
+            Vcur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0,
                         layer.attn_v_b,
                         Vcur),
                     Vcur);
@@ -1619,98 +1619,98 @@ static bool whisper_encode_internal(
             wstate.use_buf(ctx0, 0);

 #ifdef WHISPER_USE_FLASH_ATTN
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
+            struct wsp_ggml_tensor * Q =
+                wsp_ggml_permute(ctx0,
+                        wsp_ggml_cpy(ctx0,
                             Qcur,
-                            ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
+                            wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
                         0, 2, 1, 3);

-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
+            struct wsp_ggml_tensor * K =
+                wsp_ggml_permute(ctx0,
+                        wsp_ggml_cpy(ctx0,
                             Kcur,
-                            ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
+                            wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
                         0, 2, 1, 3);

-            struct ggml_tensor * V =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
+            struct wsp_ggml_tensor * V =
+                wsp_ggml_cpy(ctx0,
+                        wsp_ggml_permute(ctx0,
+                            wsp_ggml_reshape_3d(ctx0,
                                 Vcur,
                                 n_state/n_head, n_head, n_ctx),
                             1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head));
+                        wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head));

-            struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
+            struct wsp_ggml_tensor * KQV = wsp_ggml_flash_attn(ctx0, Q, K, V, false);
 #else
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
+            struct wsp_ggml_tensor * Q =
+                wsp_ggml_permute(ctx0,
+                        wsp_ggml_cpy(ctx0,
                             Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
+                            wsp_ggml_new_tensor_3d(ctx0, WSP_GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
                         0, 2, 1, 3);

-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
+            struct wsp_ggml_tensor * K =
+                wsp_ggml_permute(ctx0,
+                        wsp_ggml_cpy(ctx0,
                             Kcur,
-                            ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
+                            wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
                         0, 2, 1, 3);

             // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            struct wsp_ggml_tensor * KQ = wsp_ggml_mul_mat(ctx0, K, Q);

-            struct ggml_tensor * KQ_scaled =
-                ggml_scale_inplace(ctx0,
+            struct wsp_ggml_tensor * KQ_scaled =
+                wsp_ggml_scale_inplace(ctx0,
                         KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
+                        wsp_ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
                         );

-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
+            struct wsp_ggml_tensor * KQ_soft_max = wsp_ggml_soft_max_inplace(ctx0, KQ_scaled);

-            struct ggml_tensor * V =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
+            struct wsp_ggml_tensor * V =
+                wsp_ggml_cpy(ctx0,
+                        wsp_ggml_permute(ctx0,
+                            wsp_ggml_reshape_3d(ctx0,
                                 Vcur,
                                 n_state/n_head, n_head, n_ctx),
                             1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)
+                        wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)
                         );

-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            struct wsp_ggml_tensor * KQV = wsp_ggml_mul_mat(ctx0, V, KQ_soft_max);
 #endif
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            struct wsp_ggml_tensor * KQV_merged = wsp_ggml_permute(ctx0, KQV, 0, 2, 1, 3);

             wstate.use_buf(ctx0, 1);

-            cur = ggml_cpy(ctx0,
+            cur = wsp_ggml_cpy(ctx0,
                     KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx));
+                    wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx));
         }

         // projection
         {
             wstate.use_buf(ctx0, 0);

-            cur = ggml_mul_mat(ctx0,
+            cur = wsp_ggml_mul_mat(ctx0,
                     layer.attn_ln_1_w,
                     cur);

             wstate.use_buf(ctx0, 1);

-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
                     cur);
         }

         wstate.use_buf(ctx0, 2);

         // add the input
-        cur = ggml_add(ctx0, cur, inpL);
+        cur = wsp_ggml_add(ctx0, cur, inpL);

-        struct ggml_tensor * inpFF = cur;
+        struct wsp_ggml_tensor * inpFF = cur;

         // feed-forward network
         {
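Both attention variants in the hunk above use the standard scaled dot-product factor 1/sqrt(d_head); the non-flash path applies it once to K*Q, while the decoder hunks further below apply pow(d_head, -0.25) to Q and K separately, which multiplies out to the same factor. A quick numeric check with assumed base-model dimensions (n_state = 512, n_head = 8):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float d_head = 512.0f / 8.0f;                 // 64
        const float once   = 1.0f / std::sqrt(d_head);      // applied once to K*Q
        const float split  = std::pow(d_head, -0.25f)
                           * std::pow(d_head, -0.25f);      // applied to Q and K
        std::printf("%.3f == %.3f\n", once, split);         // both print 0.125
    }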
@@ -1718,61 +1718,61 @@ static bool whisper_encode_internal(
|
|
|
1718
1718
|
{
|
|
1719
1719
|
wstate.use_buf(ctx0, 0);
|
|
1720
1720
|
|
|
1721
|
-
cur =
|
|
1721
|
+
cur = wsp_ggml_norm(ctx0, inpFF);
|
|
1722
1722
|
|
|
1723
1723
|
wstate.use_buf(ctx0, 1);
|
|
1724
1724
|
|
|
1725
1725
|
// cur = mlp_ln_w*cur + mlp_ln_b
|
|
1726
|
-
cur =
|
|
1727
|
-
|
|
1728
|
-
|
|
1726
|
+
cur = wsp_ggml_add(ctx0,
|
|
1727
|
+
wsp_ggml_mul(ctx0,
|
|
1728
|
+
wsp_ggml_repeat(ctx0, layer.mlp_ln_w, cur),
|
|
1729
1729
|
cur),
|
|
1730
|
-
|
|
1730
|
+
wsp_ggml_repeat(ctx0, layer.mlp_ln_b, cur));
|
|
1731
1731
|
}
|
|
1732
1732
|
|
|
1733
1733
|
#ifdef WHISPER_USE_FLASH_FF
|
|
1734
1734
|
wstate.use_buf(ctx0, 0);
|
|
1735
1735
|
|
|
1736
|
-
cur =
|
|
1737
|
-
|
|
1736
|
+
cur = wsp_ggml_flash_ff(ctx0,
|
|
1737
|
+
wsp_ggml_cpy(ctx0, cur, wsp_ggml_new_tensor_2d(ctx0, wstate.itype, n_state, n_ctx)),
|
|
1738
1738
|
layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
|
|
1739
1739
|
#else
|
|
1740
1740
|
wstate.use_buf(ctx0, 0);
|
|
1741
1741
|
|
|
1742
1742
|
// fully connected
|
|
1743
|
-
cur =
|
|
1743
|
+
cur = wsp_ggml_mul_mat(ctx0,
|
|
1744
1744
|
layer.mlp_0_w,
|
|
1745
1745
|
cur);
|
|
1746
1746
|
|
|
1747
1747
|
wstate.use_buf(ctx0, 1);
|
|
1748
1748
|
|
|
1749
|
-
cur =
|
|
1750
|
-
|
|
1749
|
+
cur = wsp_ggml_add(ctx0,
|
|
1750
|
+
wsp_ggml_repeat(ctx0, layer.mlp_0_b, cur),
|
|
1751
1751
|
cur);
|
|
1752
1752
|
|
|
1753
1753
|
wstate.use_buf(ctx0, 0);
|
|
1754
1754
|
|
|
1755
1755
|
// GELU activation
|
|
1756
|
-
cur =
|
|
1756
|
+
cur = wsp_ggml_gelu(ctx0, cur);
|
|
1757
1757
|
|
|
1758
1758
|
wstate.use_buf(ctx0, 1);
|
|
1759
1759
|
|
|
1760
1760
|
// projection
|
|
1761
|
-
cur =
|
|
1761
|
+
cur = wsp_ggml_mul_mat(ctx0,
|
|
1762
1762
|
layer.mlp_1_w,
|
|
1763
1763
|
cur);
|
|
1764
1764
|
|
|
1765
1765
|
wstate.use_buf(ctx0, 0);
|
|
1766
1766
|
|
|
1767
|
-
cur =
|
|
1768
|
-
|
|
1767
|
+
cur = wsp_ggml_add(ctx0,
|
|
1768
|
+
wsp_ggml_repeat(ctx0, layer.mlp_1_b, cur),
|
|
1769
1769
|
cur);
|
|
1770
1770
|
#endif
|
|
1771
1771
|
}
|
|
1772
1772
|
|
|
1773
1773
|
wstate.use_buf(ctx0, 3);
|
|
1774
1774
|
|
|
1775
|
-
inpL =
|
|
1775
|
+
inpL = wsp_ggml_add(ctx0, cur, inpFF);
|
|
1776
1776
|
}
|
|
1777
1777
|
|
|
1778
1778
|
cur = inpL;
|
|
@@ -1781,36 +1781,36 @@ static bool whisper_encode_internal(
|
|
|
1781
1781
|
{
|
|
1782
1782
|
wstate.use_buf(ctx0, 0);
|
|
1783
1783
|
|
|
1784
|
-
cur =
|
|
1784
|
+
cur = wsp_ggml_norm(ctx0, cur);
|
|
1785
1785
|
|
|
1786
1786
|
wstate.use_buf(ctx0, 1);
|
|
1787
1787
|
|
|
1788
1788
|
// cur = ln_f_g*cur + ln_f_b
|
|
1789
|
-
cur =
|
|
1790
|
-
|
|
1791
|
-
|
|
1789
|
+
cur = wsp_ggml_add(ctx0,
|
|
1790
|
+
wsp_ggml_mul(ctx0,
|
|
1791
|
+
wsp_ggml_repeat(ctx0, model.e_ln_w, cur),
|
|
1792
1792
|
cur),
|
|
1793
|
-
|
|
1793
|
+
wsp_ggml_repeat(ctx0, model.e_ln_b, cur));
|
|
1794
1794
|
}
|
|
1795
1795
|
|
|
1796
1796
|
wstate.use_buf(ctx0, -1);
|
|
1797
1797
|
|
|
1798
1798
|
// run the computation
|
|
1799
1799
|
{
|
|
1800
|
-
struct
|
|
1800
|
+
struct wsp_ggml_cgraph gf = {};
|
|
1801
1801
|
gf.n_threads = n_threads;
|
|
1802
1802
|
|
|
1803
|
-
|
|
1804
|
-
|
|
1803
|
+
wsp_ggml_build_forward_expand(&gf, cur);
|
|
1804
|
+
wsp_ggml_graph_compute(ctx0, &gf);
|
|
1805
1805
|
|
|
1806
|
-
//
|
|
1806
|
+
//wsp_ggml_graph_print(&gf);
|
|
1807
1807
|
}
|
|
1808
1808
|
}
|
|
1809
1809
|
#ifdef WHISPER_USE_COREML
|
|
1810
1810
|
else if (use_coreml) {
|
|
1811
1811
|
wstate.use_buf(ctx0, -1);
|
|
1812
1812
|
|
|
1813
|
-
cur =
|
|
1813
|
+
cur = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx);
|
|
1814
1814
|
|
|
1815
1815
|
whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
|
|
1816
1816
|
}
|
|
@@ -1819,7 +1819,7 @@ static bool whisper_encode_internal(
|
|
|
1819
1819
|
else if (use_openvino) {
|
|
1820
1820
|
wstate.use_buf(ctx0, -1);
|
|
1821
1821
|
|
|
1822
|
-
cur =
|
|
1822
|
+
cur = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx);
|
|
1823
1823
|
|
|
1824
1824
|
if (!whisper_openvino_encode(wstate.ctx_openvino, mel, cur)) {
|
|
1825
1825
|
return false;
|
|
@@ -1843,11 +1843,11 @@ static bool whisper_encode_internal(
|
|
|
1843
1843
|
|
|
1844
1844
|
// pre-compute cross-attention memory
|
|
1845
1845
|
{
|
|
1846
|
-
struct
|
|
1846
|
+
struct wsp_ggml_cgraph gf = {};
|
|
1847
1847
|
gf.n_threads = n_threads;
|
|
1848
1848
|
|
|
1849
1849
|
// TODO: hack to disconnect the encoded features from the previous graph
|
|
1850
|
-
cur->op =
|
|
1850
|
+
cur->op = WSP_GGML_OP_NONE;
|
|
1851
1851
|
cur->src0 = nullptr;
|
|
1852
1852
|
cur->src1 = nullptr;
|
|
1853
1853
|
|
|
@@ -1856,53 +1856,53 @@ static bool whisper_encode_internal(
|
|
|
1856
1856
|
|
|
1857
1857
|
wstate.use_buf(ctx0, 0);
|
|
1858
1858
|
|
|
1859
|
-
struct
|
|
1859
|
+
struct wsp_ggml_tensor* Kcross = wsp_ggml_mul_mat(ctx0,
|
|
1860
1860
|
layer.cross_attn_k_w,
|
|
1861
1861
|
cur);
|
|
1862
1862
|
|
|
1863
|
-
Kcross =
|
|
1863
|
+
Kcross = wsp_ggml_scale_inplace(ctx0, Kcross, wsp_ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
|
|
1864
1864
|
|
|
1865
1865
|
wstate.use_buf(ctx0, 1);
|
|
1866
1866
|
|
|
1867
|
-
struct
|
|
1867
|
+
struct wsp_ggml_tensor* Vcross = wsp_ggml_mul_mat(ctx0,
|
|
1868
1868
|
layer.cross_attn_v_w,
|
|
1869
1869
|
cur);
|
|
1870
1870
|
|
|
1871
|
-
Vcross =
|
|
1872
|
-
|
|
1871
|
+
Vcross = wsp_ggml_add(ctx0,
|
|
1872
|
+
wsp_ggml_repeat(ctx0,
|
|
1873
1873
|
layer.cross_attn_v_b,
|
|
1874
1874
|
Vcross),
|
|
1875
1875
|
Vcross);
|
|
1876
1876
|
|
|
1877
1877
|
wstate.use_buf(ctx0, -1);
|
|
1878
1878
|
|
|
1879
|
-
Vcross =
|
|
1879
|
+
Vcross = wsp_ggml_transpose(ctx0, wsp_ggml_reshape_2d(ctx0, Vcross, n_state, n_ctx));
|
|
1880
1880
|
|
|
1881
|
-
struct
|
|
1882
|
-
struct
|
|
1883
|
-
( n_ctx)*
|
|
1884
|
-
(il*n_ctx)*
|
|
1881
|
+
struct wsp_ggml_tensor * k = wsp_ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (wsp_ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
|
|
1882
|
+
struct wsp_ggml_tensor * v = wsp_ggml_view_2d(ctx0, wstate.kv_cross.v, n_ctx, n_state,
|
|
1883
|
+
( n_ctx)*wsp_ggml_element_size(wstate.kv_cross.v),
|
|
1884
|
+
(il*n_ctx)*wsp_ggml_element_size(wstate.kv_cross.v)*n_state);
|
|
1885
1885
|
|
|
1886
|
-
|
|
1887
|
-
|
|
1886
|
+
wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Kcross, k));
|
|
1887
|
+
wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Vcross, v));
|
|
1888
1888
|
}
|
|
1889
1889
|
|
|
1890
|
-
|
|
1891
|
-
//
|
|
1890
|
+
wsp_ggml_graph_compute(ctx0, &gf);
|
|
1891
|
+
//wsp_ggml_graph_print(&gf);
|
|
1892
1892
|
}
|
|
1893
1893
|
|
|
1894
1894
|
////////////////////////////////////////////////////////////////////////////
|
|
1895
1895
|
|
|
1896
1896
|
//printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
|
|
1897
|
-
//
|
|
1897
|
+
// wsp_ggml_used_mem(ctx0)/1024.0/1024.0,
|
|
1898
1898
|
// wstate.get_buf_max_mem(0)/1024.0/1024.0,
|
|
1899
1899
|
// wstate.get_buf_max_mem(1)/1024.0/1024.0,
|
|
1900
1900
|
// wstate.get_buf_max_mem(2)/1024.0/1024.0,
|
|
1901
1901
|
// wstate.get_buf_max_mem(3)/1024.0/1024.0);
|
|
1902
1902
|
|
|
1903
|
-
|
|
1903
|
+
wsp_ggml_free(ctx0);
|
|
1904
1904
|
|
|
1905
|
-
wstate.t_encode_us +=
|
|
1905
|
+
wstate.t_encode_us += wsp_ggml_time_us() - t_start_us;
|
|
1906
1906
|
wstate.n_encode++;
|
|
1907
1907
|
|
|
1908
1908
|
return true;
|
|
@@ -1926,7 +1926,7 @@ static bool whisper_decode_internal(
|
|
|
1926
1926
|
const int n_tokens,
|
|
1927
1927
|
const int n_past,
|
|
1928
1928
|
const int n_threads) {
|
|
1929
|
-
const int64_t t_start_us =
|
|
1929
|
+
const int64_t t_start_us = wsp_ggml_time_us();
|
|
1930
1930
|
|
|
1931
1931
|
const auto & model = wctx.model;
|
|
1932
1932
|
const auto & hparams = model.hparams;
|
|
@@ -1949,21 +1949,21 @@ static bool whisper_decode_internal(
 
     //WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx);
 
-    struct ggml_init_params params = {
+    struct wsp_ggml_init_params params = {
         /*.mem_size   =*/ wstate.buf_compute.size(),
         /*.mem_buffer =*/ wstate.buf_compute.data(),
         /*.no_alloc   =*/ false,
     };
 
-    struct ggml_context * ctx0 = ggml_init(params);
+    struct wsp_ggml_context * ctx0 = wsp_ggml_init(params);
 
-    struct ggml_cgraph gf = {};
+    struct wsp_ggml_cgraph gf = {};
     gf.n_threads = n_threads;
 
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, tokens, N*ggml_element_size(embd));
+    struct wsp_ggml_tensor * embd = wsp_ggml_new_tensor_1d(ctx0, WSP_GGML_TYPE_I32, N);
+    memcpy(embd->data, tokens, N*wsp_ggml_element_size(embd));
 
-    struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    struct wsp_ggml_tensor * position = wsp_ggml_new_tensor_1d(ctx0, WSP_GGML_TYPE_I32, N);
     for (int i = 0; i < N; ++i) {
         ((int32_t *) position->data)[i] = n_past + i;
     }
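Note: the `position` tensor feeds the decoder absolute positions for the N incoming tokens, continuing from n_past. A minimal standalone sketch of that loop, with made-up values:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_past = 7, N = 3;                 // hypothetical decode step
        std::vector<int32_t> position(N);
        for (int i = 0; i < N; ++i) {
            position[i] = n_past + i;                // same as the loop above
        }
        for (int32_t p : position) printf("%d ", p); // prints: 7 8 9
        printf("\n");
        return 0;
    }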
@@ -1971,12 +1971,12 @@ static bool whisper_decode_internal(
     wstate.use_buf(ctx0, 3);
 
     // token encoding + position encoding
-    struct ggml_tensor * cur =
-        ggml_add(ctx0,
-            ggml_get_rows(ctx0, model.d_te, embd),
-            ggml_get_rows(ctx0, model.d_pe, position));
+    struct wsp_ggml_tensor * cur =
+        wsp_ggml_add(ctx0,
+            wsp_ggml_get_rows(ctx0, model.d_te, embd),
+            wsp_ggml_get_rows(ctx0, model.d_pe, position));
 
-    struct ggml_tensor * inpL = cur;
+    struct wsp_ggml_tensor * inpL = cur;
 
     for (int il = 0; il < n_layer; ++il) {
         const auto & layer = model.layers_decoder[il];
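Note: the get_rows/add pair above is the usual transformer input — each row of `cur` is the token embedding plus the positional embedding. A toy sketch with invented two-column tables:

    #include <cstdio>
    #include <vector>

    int main() {
        // hypothetical tables: 4 tokens / 4 positions, width 2
        std::vector<std::vector<float>> d_te = {{0,1},{1,0},{2,2},{3,3}};
        std::vector<std::vector<float>> d_pe = {{.1f,.1f},{.2f,.2f},{.3f,.3f},{.4f,.4f}};
        const int token = 2, pos = 1;
        for (int c = 0; c < 2; ++c) {
            printf("%g ", d_te[token][c] + d_pe[pos][c]);  // one row of `cur`
        }
        printf("\n");
        return 0;
    }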
@@ -1985,236 +1985,236 @@ static bool whisper_decode_internal(
         {
             wstate.use_buf(ctx0, 0);
 
-            cur = ggml_norm(ctx0, inpL);
+            cur = wsp_ggml_norm(ctx0, inpL);
 
             // cur = ln_0_w*cur + ln_0_b
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_mul(ctx0,
+                        wsp_ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
                         cur),
-                    ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
+                    wsp_ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
         }
 
         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
+            struct wsp_ggml_tensor * Qcur = wsp_ggml_mul_mat(ctx0,
                     layer.attn_q_w,
                     cur);
 
-            Qcur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
+            Qcur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0,
                         layer.attn_q_b,
                         Qcur),
                     Qcur);
 
-            Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Qcur = wsp_ggml_scale_inplace(ctx0, Qcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
             // note: no bias for Key
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
+            struct wsp_ggml_tensor * Kcur = wsp_ggml_mul_mat(ctx0,
                     layer.attn_k_w,
                     cur);
 
-            Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Kcur = wsp_ggml_scale_inplace(ctx0, Kcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
             // store key and value to memory
             {
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
+                struct wsp_ggml_tensor * Vcur = wsp_ggml_mul_mat(ctx0,
                         layer.attn_v_w,
                         cur);
 
-                Vcur = ggml_add(ctx0,
-                        ggml_repeat(ctx0,
+                Vcur = wsp_ggml_add(ctx0,
+                        wsp_ggml_repeat(ctx0,
                             layer.attn_v_b,
                             Vcur),
                         Vcur);
 
-                Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_state, N));
+                Vcur = wsp_ggml_transpose(ctx0, wsp_ggml_reshape_2d(ctx0, Vcur, n_state, N));
 
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_state, (ggml_element_size(kv_self.k)*n_state)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_state,
-                        ( n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_state + n_past*ggml_element_size(kv_self.v));
+                struct wsp_ggml_tensor * k = wsp_ggml_view_1d(ctx0, kv_self.k, N*n_state, (wsp_ggml_element_size(kv_self.k)*n_state)*(il*n_ctx + n_past));
+                struct wsp_ggml_tensor * v = wsp_ggml_view_2d(ctx0, kv_self.v, N, n_state,
+                        ( n_ctx)*wsp_ggml_element_size(kv_self.v),
+                        (il*n_ctx)*wsp_ggml_element_size(kv_self.v)*n_state + n_past*wsp_ggml_element_size(kv_self.v));
 
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+                wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Kcur, k));
+                wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Vcur, v));
             }
 
             // ------
 
             wstate.use_buf(ctx0, 0);
 
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                    ggml_cpy(ctx0,
+            struct wsp_ggml_tensor * Q =
+                wsp_ggml_permute(ctx0,
+                    wsp_ggml_cpy(ctx0,
                         Qcur,
-                        ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, N)),
+                        wsp_ggml_new_tensor_3d(ctx0, WSP_GGML_TYPE_F32, n_state/n_head, n_head, N)),
                     0, 2, 1, 3);
 
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                    ggml_reshape_3d(ctx0,
-                        ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.k)*n_state),
+            struct wsp_ggml_tensor * K =
+                wsp_ggml_permute(ctx0,
+                    wsp_ggml_reshape_3d(ctx0,
+                        wsp_ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_state, il*n_ctx*wsp_ggml_element_size(kv_self.k)*n_state),
                         n_state/n_head, n_head, n_past + N),
                     0, 2, 1, 3);
 
             wstate.use_buf(ctx0, 1);
 
             // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            struct wsp_ggml_tensor * KQ = wsp_ggml_mul_mat(ctx0, K, Q);
 
-            //struct ggml_tensor * KQ_scaled =
-            //    ggml_scale_inplace(ctx0,
+            //struct wsp_ggml_tensor * KQ_scaled =
+            //    wsp_ggml_scale_inplace(ctx0,
             //            KQ,
-            //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
+            //            wsp_ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
             //            );
 
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
+            struct wsp_ggml_tensor * KQ_masked = wsp_ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
 
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            struct wsp_ggml_tensor * KQ_soft_max = wsp_ggml_soft_max_inplace(ctx0, KQ_masked);
 
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, kv_self.v,
+            struct wsp_ggml_tensor * V =
+                wsp_ggml_view_3d(ctx0, kv_self.v,
                     n_past + N, n_state/n_head, n_head,
-                    n_ctx*ggml_element_size(kv_self.v),
-                    n_ctx*ggml_element_size(kv_self.v)*n_state/n_head,
-                    il*n_ctx*ggml_element_size(kv_self.v)*n_state);
+                    n_ctx*wsp_ggml_element_size(kv_self.v),
+                    n_ctx*wsp_ggml_element_size(kv_self.v)*n_state/n_head,
+                    il*n_ctx*wsp_ggml_element_size(kv_self.v)*n_state);
 
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            struct wsp_ggml_tensor * KQV = wsp_ggml_mul_mat(ctx0, V, KQ_soft_max);
 
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            struct wsp_ggml_tensor * KQV_merged = wsp_ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
-            cur = ggml_cpy(ctx0,
+            cur = wsp_ggml_cpy(ctx0,
                     KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, N));
+                    wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, N));
         }
 
         // projection
         {
             wstate.use_buf(ctx0, 0);
 
-            cur = ggml_mul_mat(ctx0,
+            cur = wsp_ggml_mul_mat(ctx0,
                     layer.attn_ln_1_w,
                     cur);
 
             wstate.use_buf(ctx0, 1);
 
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
                     cur);
         }
 
         wstate.use_buf(ctx0, 2);
 
         // add the input
-        struct ggml_tensor * inpCA = ggml_add(ctx0, cur, inpL);
+        struct wsp_ggml_tensor * inpCA = wsp_ggml_add(ctx0, cur, inpL);
 
         // norm
         {
             wstate.use_buf(ctx0, 0);
 
-            cur = ggml_norm(ctx0, inpCA); // note: we use inpCA here
+            cur = wsp_ggml_norm(ctx0, inpCA); // note: we use inpCA here
 
             // cur = ln_0_w*cur + ln_0_b
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, layer.cross_attn_ln_0_w, cur),
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_mul(ctx0,
+                        wsp_ggml_repeat(ctx0, layer.cross_attn_ln_0_w, cur),
                         cur),
-                    ggml_repeat(ctx0, layer.cross_attn_ln_0_b, cur));
+                    wsp_ggml_repeat(ctx0, layer.cross_attn_ln_0_b, cur));
         }
 
         // cross-attention
         {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
+            struct wsp_ggml_tensor * Qcur = wsp_ggml_mul_mat(ctx0,
                     layer.cross_attn_q_w,
                     cur);
 
-            Qcur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
+            Qcur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0,
                         layer.cross_attn_q_b,
                         Qcur),
                     Qcur);
 
-            Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Qcur = wsp_ggml_scale_inplace(ctx0, Qcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
             // Kcross is already scaled
-            struct ggml_tensor * Kcross =
-                ggml_reshape_3d(ctx0,
-                    ggml_view_1d(ctx0, wstate.kv_cross.k, M*n_state, il*M*ggml_element_size(wstate.kv_cross.k)*n_state),
+            struct wsp_ggml_tensor * Kcross =
+                wsp_ggml_reshape_3d(ctx0,
+                    wsp_ggml_view_1d(ctx0, wstate.kv_cross.k, M*n_state, il*M*wsp_ggml_element_size(wstate.kv_cross.k)*n_state),
                     n_state/n_head, n_head, M);
 
-            //struct ggml_tensor * Vcross =
-            //    ggml_reshape_3d(ctx0,
-            //            ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
+            //struct wsp_ggml_tensor * Vcross =
+            //    wsp_ggml_reshape_3d(ctx0,
+            //            wsp_ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*wsp_ggml_element_size(wstate.kv_cross.v)*n_state),
             //            n_state/n_head, n_head, M);
 
-            //struct ggml_tensor * V_trans =
-            //    ggml_cpy(ctx0,
-            //            ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
-            //            ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
+            //struct wsp_ggml_tensor * V_trans =
+            //    wsp_ggml_cpy(ctx0,
+            //            wsp_ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
+            //            wsp_ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
 
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, wstate.kv_cross.v,
+            struct wsp_ggml_tensor * V =
+                wsp_ggml_view_3d(ctx0, wstate.kv_cross.v,
                     M, n_state/n_head, n_head,
-                    M*ggml_element_size(wstate.kv_cross.v),
-                    M*ggml_element_size(wstate.kv_cross.v)*n_state/n_head,
-                    il*M*ggml_element_size(wstate.kv_cross.v)*n_state);
+                    M*wsp_ggml_element_size(wstate.kv_cross.v),
+                    M*wsp_ggml_element_size(wstate.kv_cross.v)*n_state/n_head,
+                    il*M*wsp_ggml_element_size(wstate.kv_cross.v)*n_state);
 
             // ------
 
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                    ggml_cpy(ctx0,
+            struct wsp_ggml_tensor * Q =
+                wsp_ggml_permute(ctx0,
+                    wsp_ggml_cpy(ctx0,
                         Qcur,
-                        ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, N)),
+                        wsp_ggml_new_tensor_3d(ctx0, WSP_GGML_TYPE_F32, n_state/n_head, n_head, N)),
                     0, 2, 1, 3);
 
-            struct ggml_tensor * K = ggml_permute(ctx0, Kcross, 0, 2, 1, 3);
+            struct wsp_ggml_tensor * K = wsp_ggml_permute(ctx0, Kcross, 0, 2, 1, 3);
 
             // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            struct wsp_ggml_tensor * KQ = wsp_ggml_mul_mat(ctx0, K, Q);
 
-            //struct ggml_tensor * KQ_scaled =
-            //    ggml_scale_inplace(ctx0,
+            //struct wsp_ggml_tensor * KQ_scaled =
+            //    wsp_ggml_scale_inplace(ctx0,
             //            KQ,
-            //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
+            //            wsp_ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
             //            );
 
             // no masking for cross-attention
-            //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            //struct wsp_ggml_tensor * KQ_masked = wsp_ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
 
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);
+            struct wsp_ggml_tensor * KQ_soft_max = wsp_ggml_soft_max_inplace(ctx0, KQ);
 
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            struct wsp_ggml_tensor * KQV = wsp_ggml_mul_mat(ctx0, V, KQ_soft_max);
 
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            struct wsp_ggml_tensor * KQV_merged = wsp_ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
             // cur = KQV_merged.contiguous().view(n_state, N)
-            cur = ggml_cpy(ctx0,
+            cur = wsp_ggml_cpy(ctx0,
                     KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, N));
+                    wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, N));
         }
 
         // projection
         {
             wstate.use_buf(ctx0, 0);
 
-            cur = ggml_mul_mat(ctx0,
+            cur = wsp_ggml_mul_mat(ctx0,
                     layer.cross_attn_ln_1_w,
                     cur);
 
             wstate.use_buf(ctx0, 1);
 
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, layer.cross_attn_ln_1_b, cur),
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0, layer.cross_attn_ln_1_b, cur),
                     cur);
         }
 
         wstate.use_buf(ctx0, 2);
 
         // add the input
-        cur = ggml_add(ctx0, cur, inpCA);
+        cur = wsp_ggml_add(ctx0, cur, inpCA);
 
-        struct ggml_tensor * inpFF = cur;
+        struct wsp_ggml_tensor * inpFF = cur;
 
         // feed-forward network
         {
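Note: Qcur and Kcur above are each scaled by pow(d, -0.25) instead of scaling the product once — since the factor is applied to both operands, KQ ends up carrying the standard 1/sqrt(d) attention scaling, which is why the KQ_scaled step stays commented out. A quick numeric check (sizes are illustrative):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float n_state = 512.0f, n_head = 8.0f;   // assumed dimensions
        const float d = n_state / n_head;              // per-head width
        const float both = std::pow(d, -0.25f) * std::pow(d, -0.25f);
        printf("%.6f == %.6f\n", both, 1.0f / std::sqrt(d));  // both print 0.125000
        return 0;
    }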
@@ -2222,53 +2222,53 @@ static bool whisper_decode_internal(
             {
                 wstate.use_buf(ctx0, 0);
 
-                cur = ggml_norm(ctx0, inpFF);
+                cur = wsp_ggml_norm(ctx0, inpFF);
 
                 wstate.use_buf(ctx0, 1);
 
                 // cur = mlp_ln_w*cur + mlp_ln_b
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, layer.mlp_ln_w, cur),
+                cur = wsp_ggml_add(ctx0,
+                        wsp_ggml_mul(ctx0,
+                            wsp_ggml_repeat(ctx0, layer.mlp_ln_w, cur),
                             cur),
-                        ggml_repeat(ctx0, layer.mlp_ln_b, cur));
+                        wsp_ggml_repeat(ctx0, layer.mlp_ln_b, cur));
             }
 
             wstate.use_buf(ctx0, 0);
 
             // fully connected
-            cur = ggml_mul_mat(ctx0,
+            cur = wsp_ggml_mul_mat(ctx0,
                     layer.mlp_0_w,
                     cur);
 
             wstate.use_buf(ctx0, 1);
 
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, layer.mlp_0_b, cur),
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0, layer.mlp_0_b, cur),
                     cur);
 
             wstate.use_buf(ctx0, 0);
 
             // GELU activation
-            cur = ggml_gelu(ctx0, cur);
+            cur = wsp_ggml_gelu(ctx0, cur);
 
             wstate.use_buf(ctx0, 1);
 
             // projection
-            cur = ggml_mul_mat(ctx0,
+            cur = wsp_ggml_mul_mat(ctx0,
                     layer.mlp_1_w,
                     cur);
 
             wstate.use_buf(ctx0, 0);
 
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, layer.mlp_1_b, cur),
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0, layer.mlp_1_b, cur),
                     cur);
         }
 
         wstate.use_buf(ctx0, 3);
 
-        inpL = ggml_add(ctx0, cur, inpFF);
+        inpL = wsp_ggml_add(ctx0, cur, inpFF);
     }
 
     cur = inpL;
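Note: each decoder layer's MLP above is LayerNorm, up-projection plus bias, GELU, down-projection plus bias, then a residual add. A scalar sketch of the activation path (the weights are made up; the GELU below is the tanh approximation whose constants ggml commonly uses):

    #include <cmath>
    #include <cstdio>

    // tanh-approximation GELU
    static float gelu(float x) {
        return 0.5f*x*(1.0f + std::tanh(0.7978845608f*(x + 0.044715f*x*x*x)));
    }

    int main() {
        const float x = 0.5f;                 // one activation value
        const float w0 = 1.2f, b0 = 0.1f;     // mlp_0 (up-projection)
        const float w1 = -0.7f, b1 = 0.05f;   // mlp_1 (down-projection)
        const float h = gelu(w0*x + b0);
        printf("%f\n", w1*h + b1);
        return 0;
    }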
@@ -2277,15 +2277,15 @@ static bool whisper_decode_internal(
     {
         wstate.use_buf(ctx0, 0);
 
-        cur = ggml_norm(ctx0, cur);
+        cur = wsp_ggml_norm(ctx0, cur);
 
         wstate.use_buf(ctx0, 1);
 
-        cur = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.d_ln_w, cur),
+        cur = wsp_ggml_add(ctx0,
+                wsp_ggml_mul(ctx0,
+                    wsp_ggml_repeat(ctx0, model.d_ln_w, cur),
                     cur),
-                ggml_repeat(ctx0, model.d_ln_b, cur));
+                wsp_ggml_repeat(ctx0, model.d_ln_b, cur));
     }
 
     wstate.use_buf(ctx0, 0);
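Note: the norm/mul/add chain above computes an affine layer norm — normalize each row to zero mean and unit variance, then scale by d_ln_w and shift by d_ln_b. A small standalone version with toy data (scalar scale/shift for brevity):

    #include <cmath>
    #include <cstdio>

    int main() {
        float x[4] = {1.f, 2.f, 3.f, 4.f};   // one row of activations
        const float w = 0.9f, b = 0.1f;      // per-channel scale/shift
        float mean = 0.f, var = 0.f;
        for (float v : x) mean += v / 4.f;
        for (float v : x) var  += (v - mean)*(v - mean) / 4.f;
        const float eps = 1e-5f;             // assumed epsilon
        for (float v : x) printf("%f ", w*((v - mean)/std::sqrt(var + eps)) + b);
        printf("\n");
        return 0;
    }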
@@ -2293,38 +2293,38 @@ static bool whisper_decode_internal(
     // compute logits only for the last token
     // comment this line to compute logits for all N tokens
     // might be useful in the future
-    cur = ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], (cur->ne[1] - 1)*cur->nb[1]);
+    cur = wsp_ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], (cur->ne[1] - 1)*cur->nb[1]);
 
-    struct ggml_tensor * logits = ggml_mul_mat(ctx0, model.d_te, cur);
+    struct wsp_ggml_tensor * logits = wsp_ggml_mul_mat(ctx0, model.d_te, cur);
 
     wstate.use_buf(ctx0, -1);
 
     // run the computation
     {
-        ggml_build_forward_expand(&gf, logits);
-        ggml_graph_compute       (ctx0, &gf);
+        wsp_ggml_build_forward_expand(&gf, logits);
+        wsp_ggml_graph_compute       (ctx0, &gf);
     }
 
     // extract logits for all N tokens
     //logits_out.resize(N*n_vocab);
-    //memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*N*n_vocab);
+    //memcpy(logits_out.data(), wsp_ggml_get_data(logits), sizeof(float)*N*n_vocab);
 
     // extract logits only for the last token
     logits_out.resize(n_vocab);
-    memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*n_vocab);
+    memcpy(logits_out.data(), wsp_ggml_get_data(logits), sizeof(float)*n_vocab);
 
     if (N > 1) {
         //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
-        //        ggml_used_mem(ctx0)/1024.0/1024.0,
+        //        wsp_ggml_used_mem(ctx0)/1024.0/1024.0,
         //        wstate.get_buf_max_mem(0)/1024.0/1024.0,
         //        wstate.get_buf_max_mem(1)/1024.0/1024.0,
         //        wstate.get_buf_max_mem(2)/1024.0/1024.0,
        //        wstate.get_buf_max_mem(3)/1024.0/1024.0);
     }
 
-    ggml_free(ctx0);
+    wsp_ggml_free(ctx0);
 
-    wstate.t_decode_us += ggml_time_us() - t_start_us;
+    wstate.t_decode_us += wsp_ggml_time_us() - t_start_us;
     wstate.n_decode++;
 
     return true;
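Note: the view_2d call above selects only the final token's activations — for a 2-D tensor, row i begins at byte offset i*nb[1], so the last of ne[1] rows starts at (ne[1] - 1)*nb[1]. Sketch with plain arrays (sizes invented):

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t n_state = 4, n_tokens = 3;     // hypothetical shape
        float data[12];                             // n_tokens rows of n_state floats
        for (size_t i = 0; i < 12; ++i) data[i] = (float) i;
        const size_t nb1 = n_state * sizeof(float); // bytes per row, like nb[1]
        const float * last = (const float *)((const char *) data + (n_tokens - 1)*nb1);
        for (size_t c = 0; c < n_state; ++c) printf("%g ", last[c]);  // 8 9 10 11
        printf("\n");
        return 0;
    }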
@@ -2502,7 +2502,7 @@ static bool log_mel_spectrogram(
     const whisper_filters & filters,
     const bool speed_up,
     whisper_mel & mel) {
-    const int64_t t_start_us = ggml_time_us();
+    const int64_t t_start_us = wsp_ggml_time_us();
 
     // Hanning window
     std::vector<float> hann;
@@ -2574,7 +2574,7 @@ static bool log_mel_spectrogram(
         mel.data[i] = (mel.data[i] + 4.0)/4.0;
     }
 
-    wstate.t_mel_us += ggml_time_us() - t_start_us;
+    wstate.t_mel_us += wsp_ggml_time_us() - t_start_us;
 
     //printf("mel.n_len() = %d, divided by 1500: %f, n_samples / fft_step: %d\n", mel.n_len, mel.n_len / 1500.0, n_samples / fft_step);
 
@@ -2705,7 +2705,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     }
 
     {
-        const size_t memory_size = ggml_nbytes(state->decoders[0].kv_self.k) + ggml_nbytes(state->decoders[0].kv_self.v);
+        const size_t memory_size = wsp_ggml_nbytes(state->decoders[0].kv_self.k) + wsp_ggml_nbytes(state->decoders[0].kv_self.v);
         log("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }
 
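Note: the size logged above is keys plus values across all text layers. For scale, the arithmetic looks like this with hypothetical "base"-like dimensions (the numbers are illustrative assumptions, not values measured from this package):

    #include <cstdio>

    int main() {
        const double n_layer = 6, n_ctx = 448, n_state = 512;  // assumed dims
        const double esize = 2.0;                              // F16 bytes/element
        const double bytes = 2.0 * n_layer * n_ctx * n_state * esize;  // K and V
        printf("kv self size = %7.2f MB\n", bytes / 1024.0 / 1024.0);
        return 0;
    }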
@@ -2716,7 +2716,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     }
 
     {
-        const size_t memory_size = ggml_nbytes(state->kv_cross.k) + ggml_nbytes(state->kv_cross.v);
+        const size_t memory_size = wsp_ggml_nbytes(state->kv_cross.k) + wsp_ggml_nbytes(state->kv_cross.v);
         log("%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }
 
@@ -2885,7 +2885,7 @@ struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t
 }
 
 struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader) {
-    ggml_time_init();
+    wsp_ggml_time_init();
 
     whisper_context * ctx = new whisper_context;
 
@@ -2976,7 +2976,7 @@ void whisper_free_state(struct whisper_state * state)
 void whisper_free(struct whisper_context * ctx) {
     if (ctx) {
         if (ctx->model.ctx) {
-            ggml_free(ctx->model.ctx);
+            wsp_ggml_free(ctx->model.ctx);
         }
         if (ctx->model.buf) {
             delete ctx->model.buf;
@@ -3373,7 +3373,7 @@ whisper_token whisper_token_transcribe(struct whisper_context * ctx) {
 }
 
 void whisper_print_timings(struct whisper_context * ctx) {
-    const int64_t t_end_us = ggml_time_us();
+    const int64_t t_end_us = wsp_ggml_time_us();
 
     log("\n");
     log("%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
@@ -3420,18 +3420,18 @@ const char * whisper_print_system_info(void) {
     static std::string s;
 
     s = "";
-    s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
-    s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
-    s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
-    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
-    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
-    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
-    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
-    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
-    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
-    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+    s += "AVX = " + std::to_string(wsp_ggml_cpu_has_avx()) + " | ";
+    s += "AVX2 = " + std::to_string(wsp_ggml_cpu_has_avx2()) + " | ";
+    s += "AVX512 = " + std::to_string(wsp_ggml_cpu_has_avx512()) + " | ";
+    s += "FMA = " + std::to_string(wsp_ggml_cpu_has_fma()) + " | ";
+    s += "NEON = " + std::to_string(wsp_ggml_cpu_has_neon()) + " | ";
+    s += "ARM_FMA = " + std::to_string(wsp_ggml_cpu_has_arm_fma()) + " | ";
+    s += "F16C = " + std::to_string(wsp_ggml_cpu_has_f16c()) + " | ";
+    s += "FP16_VA = " + std::to_string(wsp_ggml_cpu_has_fp16_va()) + " | ";
+    s += "WASM_SIMD = " + std::to_string(wsp_ggml_cpu_has_wasm_simd()) + " | ";
+    s += "BLAS = " + std::to_string(wsp_ggml_cpu_has_blas()) + " | ";
+    s += "SSE3 = " + std::to_string(wsp_ggml_cpu_has_sse3()) + " | ";
+    s += "VSX = " + std::to_string(wsp_ggml_cpu_has_vsx()) + " | ";
     s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
     s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
 
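Note: whisper_print_system_info() is part of the public whisper.h API, so callers can log which SIMD and back-end features the build detected. A minimal usage sketch, assuming the package's whisper.h is on the include path and the library is linked:

    #include <cstdio>
    #include "whisper.h"

    int main() {
        // prints e.g. "AVX = 1 | AVX2 = 1 | ... | COREML = 0 | OPENVINO = 0 | "
        printf("system info: %s\n", whisper_print_system_info());
        return 0;
    }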
@@ -4314,7 +4314,7 @@ int whisper_full_with_state(
     }
 
     {
-        const int64_t t_start_sample_us = ggml_time_us();
+        const int64_t t_start_sample_us = wsp_ggml_time_us();
 
         whisper_process_logits(*ctx, *state, params, state->decoders[0], t_cur);
 
@@ -4323,8 +4323,8 @@ int whisper_full_with_state(
         for (int j = 1; j < n_decoders_cur; ++j) {
             auto & decoder = state->decoders[j];
 
-            memcpy(decoder.kv_self.k->data, state->decoders[0].kv_self.k->data, ggml_nbytes(decoder.kv_self.k));
-            memcpy(decoder.kv_self.v->data, state->decoders[0].kv_self.v->data, ggml_nbytes(decoder.kv_self.v));
+            memcpy(decoder.kv_self.k->data, state->decoders[0].kv_self.k->data, wsp_ggml_nbytes(decoder.kv_self.k));
+            memcpy(decoder.kv_self.v->data, state->decoders[0].kv_self.v->data, wsp_ggml_nbytes(decoder.kv_self.v));
 
             decoder.kv_self.n += prompt.size();
 
@@ -4333,12 +4333,12 @@ int whisper_full_with_state(
             memcpy(decoder.logprobs.data(), state->decoders[0].logprobs.data(), decoder.logprobs.size()*sizeof(decoder.logprobs[0]));
         }
 
-        state->t_sample_us += ggml_time_us() - t_start_sample_us;
+        state->t_sample_us += wsp_ggml_time_us() - t_start_sample_us;
     }
 }
 
 for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
-    const int64_t t_start_sample_us = ggml_time_us();
+    const int64_t t_start_sample_us = wsp_ggml_time_us();
 
     // store the KV caches of all decoders when doing beam-search
     if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
@@ -4350,8 +4350,8 @@ int whisper_full_with_state(
                 continue;
             }
 
-            kv_bufs[j].k.resize(ggml_nbytes(decoder.kv_self.k));
-            kv_bufs[j].v.resize(ggml_nbytes(decoder.kv_self.v));
+            kv_bufs[j].k.resize(wsp_ggml_nbytes(decoder.kv_self.k));
+            kv_bufs[j].v.resize(wsp_ggml_nbytes(decoder.kv_self.v));
 
             memcpy(kv_bufs[j].k.data(), decoder.kv_self.k->data, kv_bufs[j].k.size());
             memcpy(kv_bufs[j].v.data(), decoder.kv_self.v->data, kv_bufs[j].v.size());
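Note: during beam search the decoder caches are snapshotted into plain byte buffers sized from wsp_ggml_nbytes, and copied back when a candidate beam is adopted. A standalone sketch of the same save/restore pattern, with a stand-in buffer in place of the real cache tensor:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main() {
        std::vector<float> kv = {1.f, 2.f, 3.f};          // stands in for kv_self.k->data
        const size_t nbytes = kv.size()*sizeof(float);    // stands in for wsp_ggml_nbytes(...)
        std::vector<uint8_t> snapshot(nbytes);
        std::memcpy(snapshot.data(), kv.data(), nbytes);  // save
        std::memcpy(kv.data(), snapshot.data(), nbytes);  // restore
        printf("snapshot holds %zu bytes\n", snapshot.size());
        return 0;
    }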
@@ -4531,7 +4531,7 @@ int whisper_full_with_state(
             }
         }
 
-        state->t_sample_us += ggml_time_us() - t_start_sample_us;
+        state->t_sample_us += wsp_ggml_time_us() - t_start_sample_us;
 
         // obtain logits for the next token
         for (int j = 0; j < n_decoders_cur; ++j) {
@@ -4552,13 +4552,13 @@ int whisper_full_with_state(
             }
 
             {
-                const int64_t t_start_sample_us = ggml_time_us();
+                const int64_t t_start_sample_us = wsp_ggml_time_us();
 
                 whisper_process_logits(*ctx, *state, params, decoder, t_cur);
 
                 ++decoder.kv_self.n;
 
-                state->t_sample_us += ggml_time_us() - t_start_sample_us;
+                state->t_sample_us += wsp_ggml_time_us() - t_start_sample_us;
             }
         }
     }
@@ -4980,7 +4980,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
     s = "";
     char strbuf[256];
 
-    ggml_time_init();
+    wsp_ggml_time_init();
 
     size_t n = 20;
     size_t arr = n_threads > 0 ? 1024llu : n_threads; // trick to avoid compiler optimizations
@@ -5001,11 +5001,11 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
         double sum = 0.0;
 
         for (size_t i = 0; i < n; i++) {
-            const int64_t t0 = ggml_time_us();
+            const int64_t t0 = wsp_ggml_time_us();
 
             memcpy(dst, src, size);
 
-            const int64_t t1 = ggml_time_us();
+            const int64_t t1 = wsp_ggml_time_us();
 
             tsum += (t1 - t0)*1e-6;
 
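Note: the wsp_ggml_time_us() calls follow one idiom throughout this file — stamp before the work, stamp after, accumulate the delta. Since the ggml timer is internal to the package, here is the same idiom sketched with std::chrono purely for illustration:

    #include <chrono>
    #include <cstdio>

    int main() {
        using clock = std::chrono::steady_clock;
        double tsum = 0.0;                    // accumulated seconds
        const auto t0 = clock::now();
        volatile double sink = 0.0;           // stand-in for the measured work
        for (int i = 0; i < 1000000; ++i) sink = sink + i;
        const auto t1 = clock::now();
        tsum += std::chrono::duration<double>(t1 - t0).count();
        printf("elapsed: %.6f s\n", tsum);
        return 0;
    }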
@@ -5030,17 +5030,17 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
     return s.c_str();
 }
 
-WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
-    fputs(whisper_bench_ggml_mul_mat_str(n_threads), stderr);
+WHISPER_API int whisper_bench_wsp_ggml_mul_mat(int n_threads) {
+    fputs(whisper_bench_wsp_ggml_mul_mat_str(n_threads), stderr);
     return 0;
 }
 
-WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
+WHISPER_API const char * whisper_bench_wsp_ggml_mul_mat_str(int n_threads) {
     static std::string s;
     s = "";
     char strbuf[256];
 
-    ggml_time_init();
+    wsp_ggml_time_init();
 
     const int n_max = 128;
 
@@ -5080,45 +5080,45 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
             const size_t N = sizes[j];
 
             for (int k = 0; k < 7; ++k) {
-                const ggml_type wtype =
-                    k == 0 ? GGML_TYPE_Q4_0 :
-                    k == 1 ? GGML_TYPE_Q4_1 :
-                    k == 2 ? GGML_TYPE_Q5_0 :
-                    k == 3 ? GGML_TYPE_Q5_1 :
-                    k == 4 ? GGML_TYPE_Q8_0 :
-                    k == 5 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+                const wsp_ggml_type wtype =
+                    k == 0 ? WSP_GGML_TYPE_Q4_0 :
+                    k == 1 ? WSP_GGML_TYPE_Q4_1 :
+                    k == 2 ? WSP_GGML_TYPE_Q5_0 :
+                    k == 3 ? WSP_GGML_TYPE_Q5_1 :
+                    k == 4 ? WSP_GGML_TYPE_Q8_0 :
+                    k == 5 ? WSP_GGML_TYPE_F16 : WSP_GGML_TYPE_F32;
 
                 double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32;
                 int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? n_fp16 : /*k == 6*/ n_fp32;
 
-                struct ggml_init_params gparams = {
+                struct wsp_ggml_init_params gparams = {
                     /*.mem_size   =*/ buf.size(),
                     /*.mem_buffer =*/ buf.data(),
                     /*.no_alloc   =*/ false,
                 };
 
-                struct ggml_context * ctx0 = ggml_init(gparams);
+                struct wsp_ggml_context * ctx0 = wsp_ggml_init(gparams);
 
-                struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N);
-                struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
+                struct wsp_ggml_tensor * a = wsp_ggml_new_tensor_2d(ctx0, wtype, N, N);
+                struct wsp_ggml_tensor * b = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, N, N);
 
-                struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
+                struct wsp_ggml_tensor * c = wsp_ggml_mul_mat(ctx0, a, b);
 
-                struct ggml_cgraph gf = ggml_build_forward(c);
+                struct wsp_ggml_cgraph gf = wsp_ggml_build_forward(c);
 
                 gf.n_threads = n_threads;
 
                 double tsum = 0.0;
 
                 // heat-up
-                ggml_graph_compute(ctx0, &gf);
+                wsp_ggml_graph_compute(ctx0, &gf);
 
                 for (int i = 0; i < n_max; ++i) {
-                    const int64_t t0 = ggml_time_us();
+                    const int64_t t0 = wsp_ggml_time_us();
 
-                    ggml_graph_compute(ctx0, &gf);
+                    wsp_ggml_graph_compute(ctx0, &gf);
 
-                    const int64_t t1 = ggml_time_us();
+                    const int64_t t1 = wsp_ggml_time_us();
 
                     tsum += (t1 - t0)*1e-6;
                     n++;
@@ -5128,7 +5128,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
                 }
             }
 
-            ggml_free(ctx0);
+            wsp_ggml_free(ctx0);
 
             s = ((2.0*N*N*N*n)/tsum)*1e-9;
         }