whisper.rn 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/whisper.cpp CHANGED
@@ -28,7 +28,7 @@
28
28
  #pragma warning(disable: 4244 4267) // possible loss of data
29
29
  #endif
30
30
 
31
- #if defined(GGML_BIG_ENDIAN)
31
+ #if defined(WSP_GGML_BIG_ENDIAN)
32
32
  #include <bit>
33
33
 
34
34
  template<typename T>
@@ -42,28 +42,28 @@ float byteswap(float value) {
42
42
  }
43
43
 
44
44
  template<typename T>
45
- static void byteswap_tensor_data(ggml_tensor * tensor) {
45
+ static void byteswap_tensor_data(wsp_ggml_tensor * tensor) {
46
46
  T * datum = reinterpret_cast<T *>(tensor->data);
47
- for (int i = 0; i < ggml_nelements(tensor); i++) {
47
+ for (int i = 0; i < wsp_ggml_nelements(tensor); i++) {
48
48
  datum[i] = byteswap(datum[i]);
49
49
  }
50
50
  }
51
51
 
52
- static void byteswap_tensor(ggml_tensor * tensor) {
52
+ static void byteswap_tensor(wsp_ggml_tensor * tensor) {
53
53
  switch (tensor->type) {
54
- case GGML_TYPE_I16: {
54
+ case WSP_GGML_TYPE_I16: {
55
55
  byteswap_tensor_data<int16_t>(tensor);
56
56
  break;
57
57
  }
58
- case GGML_TYPE_F16: {
59
- byteswap_tensor_data<ggml_fp16_t>(tensor);
58
+ case WSP_GGML_TYPE_F16: {
59
+ byteswap_tensor_data<wsp_ggml_fp16_t>(tensor);
60
60
  break;
61
61
  }
62
- case GGML_TYPE_I32: {
62
+ case WSP_GGML_TYPE_I32: {
63
63
  byteswap_tensor_data<int32_t>(tensor);
64
64
  break;
65
65
  }
66
- case GGML_TYPE_F32: {
66
+ case WSP_GGML_TYPE_F32: {
67
67
  byteswap_tensor_data<float>(tensor);
68
68
  break;
69
69
  }
@@ -263,8 +263,8 @@ static const std::map<e_model, size_t> MEM_REQ_SCRATCH3 = {
263
263
  { MODEL_LARGE, 9ull*MB },
264
264
  };
265
265
 
266
- static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
267
- { GGML_TYPE_F32,
266
+ static const std::map<wsp_ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
267
+ { WSP_GGML_TYPE_F32,
268
268
  {
269
269
  { MODEL_TINY, 74ull*MB },
270
270
  { MODEL_BASE, 142ull*MB },
@@ -273,7 +273,7 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
273
273
  { MODEL_LARGE, 2952ull*MB },
274
274
  },
275
275
  },
276
- { GGML_TYPE_F16,
276
+ { WSP_GGML_TYPE_F16,
277
277
  {
278
278
  { MODEL_TINY, 74ull*MB },
279
279
  { MODEL_BASE, 142ull*MB },
@@ -282,7 +282,7 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
282
282
  { MODEL_LARGE, 2952ull*MB },
283
283
  },
284
284
  },
285
- { GGML_TYPE_Q4_0,
285
+ { WSP_GGML_TYPE_Q4_0,
286
286
  {
287
287
  { MODEL_TINY, 26ull*MB },
288
288
  { MODEL_BASE, 50ull*MB },
@@ -291,7 +291,7 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
291
291
  { MODEL_LARGE, 940ull*MB },
292
292
  },
293
293
  },
294
- { GGML_TYPE_Q4_1,
294
+ { WSP_GGML_TYPE_Q4_1,
295
295
  {
296
296
  { MODEL_TINY, 32ull*MB },
297
297
  { MODEL_BASE, 58ull*MB },
@@ -300,7 +300,7 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
300
300
  { MODEL_LARGE, 1124ull*MB },
301
301
  },
302
302
  },
303
- { GGML_TYPE_Q5_0,
303
+ { WSP_GGML_TYPE_Q5_0,
304
304
  {
305
305
  { MODEL_TINY, 30ull*MB },
306
306
  { MODEL_BASE, 54ull*MB },
@@ -309,7 +309,7 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
309
309
  { MODEL_LARGE, 1034ull*MB },
310
310
  },
311
311
  },
312
- { GGML_TYPE_Q5_1,
312
+ { WSP_GGML_TYPE_Q5_1,
313
313
  {
314
314
  { MODEL_TINY, 32ull*MB },
315
315
  { MODEL_BASE, 58ull*MB },
@@ -318,7 +318,7 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
318
318
  { MODEL_LARGE, 1124ull*MB },
319
319
  },
320
320
  },
321
- { GGML_TYPE_Q8_0,
321
+ { WSP_GGML_TYPE_Q8_0,
322
322
  {
323
323
  { MODEL_TINY, 45ull*MB },
324
324
  { MODEL_BASE, 84ull*MB },
@@ -446,95 +446,95 @@ struct whisper_hparams {
446
446
  // audio encoding layer
447
447
  struct whisper_layer_encoder {
448
448
  // encoder.blocks.*.attn_ln
449
- struct ggml_tensor * attn_ln_0_w;
450
- struct ggml_tensor * attn_ln_0_b;
449
+ struct wsp_ggml_tensor * attn_ln_0_w;
450
+ struct wsp_ggml_tensor * attn_ln_0_b;
451
451
 
452
452
  // encoder.blocks.*.attn.out
453
- struct ggml_tensor * attn_ln_1_w;
454
- struct ggml_tensor * attn_ln_1_b;
453
+ struct wsp_ggml_tensor * attn_ln_1_w;
454
+ struct wsp_ggml_tensor * attn_ln_1_b;
455
455
 
456
456
  // encoder.blocks.*.attn.query
457
- struct ggml_tensor * attn_q_w;
458
- struct ggml_tensor * attn_q_b;
457
+ struct wsp_ggml_tensor * attn_q_w;
458
+ struct wsp_ggml_tensor * attn_q_b;
459
459
 
460
460
  // encoder.blocks.*.attn.key
461
- struct ggml_tensor * attn_k_w;
461
+ struct wsp_ggml_tensor * attn_k_w;
462
462
 
463
463
  // encoder.blocks.*.attn.value
464
- struct ggml_tensor * attn_v_w;
465
- struct ggml_tensor * attn_v_b;
464
+ struct wsp_ggml_tensor * attn_v_w;
465
+ struct wsp_ggml_tensor * attn_v_b;
466
466
 
467
467
  // encoder.blocks.*.mlp_ln
468
- struct ggml_tensor * mlp_ln_w;
469
- struct ggml_tensor * mlp_ln_b;
468
+ struct wsp_ggml_tensor * mlp_ln_w;
469
+ struct wsp_ggml_tensor * mlp_ln_b;
470
470
 
471
471
  // encoder.blocks.*.mlp.0
472
- struct ggml_tensor * mlp_0_w;
473
- struct ggml_tensor * mlp_0_b;
472
+ struct wsp_ggml_tensor * mlp_0_w;
473
+ struct wsp_ggml_tensor * mlp_0_b;
474
474
 
475
475
  // encoder.blocks.*.mlp.2
476
- struct ggml_tensor * mlp_1_w;
477
- struct ggml_tensor * mlp_1_b;
476
+ struct wsp_ggml_tensor * mlp_1_w;
477
+ struct wsp_ggml_tensor * mlp_1_b;
478
478
  };
479
479
 
480
480
  // token decoding layer
481
481
  struct whisper_layer_decoder {
482
482
  // decoder.blocks.*.attn_ln
483
- struct ggml_tensor * attn_ln_0_w;
484
- struct ggml_tensor * attn_ln_0_b;
483
+ struct wsp_ggml_tensor * attn_ln_0_w;
484
+ struct wsp_ggml_tensor * attn_ln_0_b;
485
485
 
486
486
  // decoder.blocks.*.attn.out
487
- struct ggml_tensor * attn_ln_1_w;
488
- struct ggml_tensor * attn_ln_1_b;
487
+ struct wsp_ggml_tensor * attn_ln_1_w;
488
+ struct wsp_ggml_tensor * attn_ln_1_b;
489
489
 
490
490
  // decoder.blocks.*.attn.query
491
- struct ggml_tensor * attn_q_w;
492
- struct ggml_tensor * attn_q_b;
491
+ struct wsp_ggml_tensor * attn_q_w;
492
+ struct wsp_ggml_tensor * attn_q_b;
493
493
 
494
494
  // decoder.blocks.*.attn.key
495
- struct ggml_tensor * attn_k_w;
495
+ struct wsp_ggml_tensor * attn_k_w;
496
496
 
497
497
  // decoder.blocks.*.attn.value
498
- struct ggml_tensor * attn_v_w;
499
- struct ggml_tensor * attn_v_b;
498
+ struct wsp_ggml_tensor * attn_v_w;
499
+ struct wsp_ggml_tensor * attn_v_b;
500
500
 
501
501
  // decoder.blocks.*.cross_attn_ln
502
- struct ggml_tensor * cross_attn_ln_0_w;
503
- struct ggml_tensor * cross_attn_ln_0_b;
502
+ struct wsp_ggml_tensor * cross_attn_ln_0_w;
503
+ struct wsp_ggml_tensor * cross_attn_ln_0_b;
504
504
 
505
505
  // decoder.blocks.*.cross_attn.out
506
- struct ggml_tensor * cross_attn_ln_1_w;
507
- struct ggml_tensor * cross_attn_ln_1_b;
506
+ struct wsp_ggml_tensor * cross_attn_ln_1_w;
507
+ struct wsp_ggml_tensor * cross_attn_ln_1_b;
508
508
 
509
509
  // decoder.blocks.*.cross_attn.query
510
- struct ggml_tensor * cross_attn_q_w;
511
- struct ggml_tensor * cross_attn_q_b;
510
+ struct wsp_ggml_tensor * cross_attn_q_w;
511
+ struct wsp_ggml_tensor * cross_attn_q_b;
512
512
 
513
513
  // decoder.blocks.*.cross_attn.key
514
- struct ggml_tensor * cross_attn_k_w;
514
+ struct wsp_ggml_tensor * cross_attn_k_w;
515
515
 
516
516
  // decoder.blocks.*.cross_attn.value
517
- struct ggml_tensor * cross_attn_v_w;
518
- struct ggml_tensor * cross_attn_v_b;
517
+ struct wsp_ggml_tensor * cross_attn_v_w;
518
+ struct wsp_ggml_tensor * cross_attn_v_b;
519
519
 
520
520
  // decoder.blocks.*.mlp_ln
521
- struct ggml_tensor * mlp_ln_w;
522
- struct ggml_tensor * mlp_ln_b;
521
+ struct wsp_ggml_tensor * mlp_ln_w;
522
+ struct wsp_ggml_tensor * mlp_ln_b;
523
523
 
524
524
  // decoder.blocks.*.mlp.0
525
- struct ggml_tensor * mlp_0_w;
526
- struct ggml_tensor * mlp_0_b;
525
+ struct wsp_ggml_tensor * mlp_0_w;
526
+ struct wsp_ggml_tensor * mlp_0_b;
527
527
 
528
528
  // decoder.blocks.*.mlp.2
529
- struct ggml_tensor * mlp_1_w;
530
- struct ggml_tensor * mlp_1_b;
529
+ struct wsp_ggml_tensor * mlp_1_w;
530
+ struct wsp_ggml_tensor * mlp_1_b;
531
531
  };
532
532
 
533
533
  struct whisper_kv_cache {
534
- struct ggml_tensor * k;
535
- struct ggml_tensor * v;
534
+ struct wsp_ggml_tensor * k;
535
+ struct wsp_ggml_tensor * v;
536
536
 
537
- struct ggml_context * ctx;
537
+ struct wsp_ggml_context * ctx;
538
538
 
539
539
  std::vector<uint8_t> buf;
540
540
 
@@ -548,42 +548,42 @@ struct whisper_model {
548
548
  whisper_filters filters;
549
549
 
550
550
  // encoder.positional_embedding
551
- struct ggml_tensor * e_pe;
551
+ struct wsp_ggml_tensor * e_pe;
552
552
 
553
553
  // encoder.conv1
554
- struct ggml_tensor * e_conv_1_w;
555
- struct ggml_tensor * e_conv_1_b;
554
+ struct wsp_ggml_tensor * e_conv_1_w;
555
+ struct wsp_ggml_tensor * e_conv_1_b;
556
556
 
557
557
  // encoder.conv2
558
- struct ggml_tensor * e_conv_2_w;
559
- struct ggml_tensor * e_conv_2_b;
558
+ struct wsp_ggml_tensor * e_conv_2_w;
559
+ struct wsp_ggml_tensor * e_conv_2_b;
560
560
 
561
561
  // encoder.ln_post
562
- struct ggml_tensor * e_ln_w;
563
- struct ggml_tensor * e_ln_b;
562
+ struct wsp_ggml_tensor * e_ln_w;
563
+ struct wsp_ggml_tensor * e_ln_b;
564
564
 
565
565
  // decoder.positional_embedding
566
- struct ggml_tensor * d_pe;
566
+ struct wsp_ggml_tensor * d_pe;
567
567
 
568
568
  // decoder.token_embedding
569
- struct ggml_tensor * d_te;
569
+ struct wsp_ggml_tensor * d_te;
570
570
 
571
571
  // decoder.ln
572
- struct ggml_tensor * d_ln_w;
573
- struct ggml_tensor * d_ln_b;
572
+ struct wsp_ggml_tensor * d_ln_w;
573
+ struct wsp_ggml_tensor * d_ln_b;
574
574
 
575
575
  std::vector<whisper_layer_encoder> layers_encoder;
576
576
  std::vector<whisper_layer_decoder> layers_decoder;
577
577
 
578
578
  // context
579
- struct ggml_context * ctx;
579
+ struct wsp_ggml_context * ctx;
580
580
 
581
581
  // the model memory buffer is read-only and can be shared between processors
582
582
  std::vector<uint8_t> * buf;
583
583
 
584
584
  // tensors
585
585
  int n_loaded;
586
- std::map<std::string, struct ggml_tensor *> tensors;
586
+ std::map<std::string, struct wsp_ggml_tensor *> tensors;
587
587
  };
588
588
 
589
589
  struct whisper_sequence {
@@ -678,15 +678,15 @@ struct whisper_state {
678
678
  // [EXPERIMENTAL] speed-up techniques
679
679
  int32_t exp_n_audio_ctx = 0; // 0 - use default
680
680
 
681
- void use_buf(struct ggml_context * ctx, int i) {
681
+ void use_buf(struct wsp_ggml_context * ctx, int i) {
682
682
  #if defined(WHISPER_USE_SCRATCH)
683
683
  size_t last_size = 0;
684
684
 
685
685
  if (i == -1) {
686
- last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
686
+ last_size = wsp_ggml_set_scratch(ctx, { 0, 0, nullptr, });
687
687
  } else {
688
688
  auto & buf = buf_scratch[i];
689
- last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
689
+ last_size = wsp_ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
690
690
  }
691
691
 
692
692
  if (buf_last >= 0) {
@@ -714,8 +714,8 @@ struct whisper_context {
714
714
  int64_t t_load_us = 0;
715
715
  int64_t t_start_us = 0;
716
716
 
717
- ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX)
718
- ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16)
717
+ wsp_ggml_type wtype = wsp_ggml_type::WSP_GGML_TYPE_F16; // weight type (FP32 / FP16 / QX)
718
+ wsp_ggml_type itype = wsp_ggml_type::WSP_GGML_TYPE_F16; // intermediate type (FP32 or FP16)
719
719
 
720
720
  whisper_model model;
721
721
  whisper_vocab vocab;
@@ -749,17 +749,17 @@ static bool kv_cache_init(
749
749
  const struct whisper_hparams & hparams,
750
750
  const size_t mem_bytes,
751
751
  struct whisper_kv_cache & cache,
752
- ggml_type wtype,
752
+ wsp_ggml_type wtype,
753
753
  int n_ctx) {
754
754
  cache.buf.resize(mem_bytes);
755
755
 
756
- struct ggml_init_params params = {
756
+ struct wsp_ggml_init_params params = {
757
757
  /*.mem_size =*/ cache.buf.size(),
758
758
  /*.mem_buffer =*/ cache.buf.data(),
759
759
  /*.no_alloc =*/ false,
760
760
  };
761
761
 
762
- cache.ctx = ggml_init(params);
762
+ cache.ctx = wsp_ggml_init(params);
763
763
 
764
764
  if (!cache.ctx) {
765
765
  log("%s: failed to allocate memory for kv cache\n", __func__);
@@ -772,8 +772,8 @@ static bool kv_cache_init(
772
772
  const int n_mem = n_text_layer*n_ctx;
773
773
  const int n_elements = n_text_state*n_mem;
774
774
 
775
- cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
776
- cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
775
+ cache.k = wsp_ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
776
+ cache.v = wsp_ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
777
777
 
778
778
  return true;
779
779
  }
@@ -781,36 +781,36 @@ static bool kv_cache_init(
781
781
  static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
782
782
  WHISPER_ASSERT(cache.ctx);
783
783
 
784
- const int n_elements = ggml_nelements(cache.k);
785
- WHISPER_ASSERT(n_elements == ggml_nelements(cache.v));
784
+ const int n_elements = wsp_ggml_nelements(cache.k);
785
+ WHISPER_ASSERT(n_elements == wsp_ggml_nelements(cache.v));
786
786
 
787
- const ggml_type wtype = cache.k->type;
787
+ const wsp_ggml_type wtype = cache.k->type;
788
788
  WHISPER_ASSERT(wtype == cache.v->type);
789
789
 
790
- WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_sizef(wtype));
790
+ WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*wsp_ggml_type_sizef(wtype));
791
791
 
792
- struct ggml_init_params params = {
792
+ struct wsp_ggml_init_params params = {
793
793
  /*.mem_size =*/ cache.buf.size(),
794
794
  /*.mem_buffer =*/ cache.buf.data(),
795
795
  /*.no_alloc =*/ false,
796
796
  };
797
797
 
798
- cache.ctx = ggml_init(params);
798
+ cache.ctx = wsp_ggml_init(params);
799
799
 
800
800
  if (!cache.ctx) {
801
801
  log("%s: failed to allocate memory for kv cache\n", __func__);
802
802
  return false;
803
803
  }
804
804
 
805
- cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
806
- cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
805
+ cache.k = wsp_ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
806
+ cache.v = wsp_ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
807
807
 
808
808
  return true;
809
809
  }
810
810
 
811
811
  static void kv_cache_free(struct whisper_kv_cache & cache) {
812
812
  if (cache.ctx) {
813
- ggml_free(cache.ctx);
813
+ wsp_ggml_free(cache.ctx);
814
814
  cache.ctx = nullptr;
815
815
  }
816
816
  }
@@ -829,7 +829,7 @@ static void kv_cache_free(struct whisper_kv_cache & cache) {
829
829
  static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
830
830
  log("%s: loading model\n", __func__);
831
831
 
832
- const int64_t t_start_us = ggml_time_us();
832
+ const int64_t t_start_us = wsp_ggml_time_us();
833
833
 
834
834
  wctx.t_start_us = t_start_us;
835
835
 
@@ -840,7 +840,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
840
840
  {
841
841
  uint32_t magic;
842
842
  read_safe(loader, magic);
843
- if (magic != GGML_FILE_MAGIC) {
843
+ if (magic != WSP_GGML_FILE_MAGIC) {
844
844
  log("%s: invalid model data (bad magic)\n", __func__);
845
845
  return false;
846
846
  }
@@ -884,14 +884,14 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
884
884
  model.type = e_model::MODEL_LARGE;
885
885
  }
886
886
 
887
- const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
887
+ const int32_t qntvr = hparams.ftype / WSP_GGML_QNT_VERSION_FACTOR;
888
888
 
889
- hparams.ftype %= GGML_QNT_VERSION_FACTOR;
889
+ hparams.ftype %= WSP_GGML_QNT_VERSION_FACTOR;
890
890
 
891
891
  // for the big tensors, we have the option to store the data in 16-bit floats or quantized
892
892
  // in order to save memory and also to speed up the computation
893
- wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
894
- if (wctx.wtype == GGML_TYPE_COUNT) {
893
+ wctx.wtype = wsp_ggml_ftype_to_wsp_ggml_type((wsp_ggml_ftype) (model.hparams.ftype));
894
+ if (wctx.wtype == WSP_GGML_TYPE_COUNT) {
895
895
  log("%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
896
896
  return false;
897
897
  }
@@ -1033,8 +1033,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1033
1033
 
1034
1034
  size_t ctx_size = 0;
1035
1035
 
1036
- const ggml_type wtype = wctx.wtype;
1037
- const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type
1036
+ const wsp_ggml_type wtype = wctx.wtype;
1037
+ const wsp_ggml_type vtype = wctx.wtype == WSP_GGML_TYPE_F32 ? WSP_GGML_TYPE_F32 : WSP_GGML_TYPE_F16; // conv type
1038
1038
 
1039
1039
  {
1040
1040
  const auto & hparams = model.hparams;
@@ -1053,92 +1053,92 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1053
1053
 
1054
1054
  // encoder
1055
1055
  {
1056
- ctx_size += n_audio_ctx*n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_pe;
1056
+ ctx_size += n_audio_ctx*n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_pe;
1057
1057
 
1058
- ctx_size += 3*n_mels*n_audio_state*ggml_type_sizef(vtype); // e_conv_1_w
1059
- ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_1_b
1058
+ ctx_size += 3*n_mels*n_audio_state*wsp_ggml_type_sizef(vtype); // e_conv_1_w
1059
+ ctx_size += n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_conv_1_b
1060
1060
 
1061
- ctx_size += 3*n_audio_state*n_audio_state*ggml_type_sizef(vtype); // e_conv_2_w
1062
- ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_2_b
1061
+ ctx_size += 3*n_audio_state*n_audio_state*wsp_ggml_type_sizef(vtype); // e_conv_2_w
1062
+ ctx_size += n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_conv_2_b
1063
1063
 
1064
- ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_w;
1065
- ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_b;
1064
+ ctx_size += n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_ln_w;
1065
+ ctx_size += n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_ln_b;
1066
1066
  }
1067
1067
 
1068
1068
  // decoder
1069
1069
  {
1070
- ctx_size += n_text_ctx*n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_pe;
1070
+ ctx_size += n_text_ctx*n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // d_pe;
1071
1071
 
1072
- ctx_size += n_vocab*n_text_state*ggml_type_sizef(wtype); // d_te;
1072
+ ctx_size += n_vocab*n_text_state*wsp_ggml_type_sizef(wtype); // d_te;
1073
1073
 
1074
- ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_w;
1075
- ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_b;
1074
+ ctx_size += n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // d_ln_w;
1075
+ ctx_size += n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // d_ln_b;
1076
1076
  }
1077
1077
 
1078
1078
  // encoder layers
1079
1079
  {
1080
- ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
1081
- ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b
1080
+ ctx_size += n_audio_layer*(n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_ln_w
1081
+ ctx_size += n_audio_layer*(n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_ln_b
1082
1082
 
1083
- ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_0_w
1084
- ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b
1083
+ ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // mlp_0_w
1084
+ ctx_size += n_audio_layer*( 4*n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_0_b
1085
1085
 
1086
- ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_1_w
1087
- ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b
1086
+ ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // mlp_1_w
1087
+ ctx_size += n_audio_layer*( n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_1_b
1088
1088
 
1089
- ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
1090
- ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b
1089
+ ctx_size += n_audio_layer*(n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_0_w
1090
+ ctx_size += n_audio_layer*(n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_0_b
1091
1091
 
1092
- ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_q_w
1093
- ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b
1092
+ ctx_size += n_audio_layer*(n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // attn_q_w
1093
+ ctx_size += n_audio_layer*( n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_q_b
1094
1094
 
1095
- ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_k_w
1095
+ ctx_size += n_audio_layer*(n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // attn_k_w
1096
1096
 
1097
- ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_v_w
1098
- ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b
1097
+ ctx_size += n_audio_layer*(n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // attn_v_w
1098
+ ctx_size += n_audio_layer*( n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_v_b
1099
1099
 
1100
- ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_ln_1_w
1101
- ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
1100
+ ctx_size += n_audio_layer*(n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // attn_ln_1_w
1101
+ ctx_size += n_audio_layer*( n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_1_b
1102
1102
  }
1103
1103
 
1104
1104
  // decoder layers
1105
1105
  {
1106
- ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
1107
- ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b
1106
+ ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_ln_w
1107
+ ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_ln_b
1108
1108
 
1109
- ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_0_w
1110
- ctx_size += n_text_layer*( 4*n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b
1109
+ ctx_size += n_text_layer*(4*n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // mlp_0_w
1110
+ ctx_size += n_text_layer*( 4*n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_0_b
1111
1111
 
1112
- ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_1_w
1113
- ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b
1112
+ ctx_size += n_text_layer*(4*n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // mlp_1_w
1113
+ ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_1_b
1114
1114
 
1115
- ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
1116
- ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b
1115
+ ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_0_w
1116
+ ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_0_b
1117
1117
 
1118
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_q_w
1119
- ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b
1118
+ ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // attn_q_w
1119
+ ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_q_b
1120
1120
 
1121
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_k_w
1121
+ ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // attn_k_w
1122
1122
 
1123
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_v_w
1124
- ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b
1123
+ ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // attn_v_w
1124
+ ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_v_b
1125
1125
 
1126
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_ln_1_w
1127
- ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
1126
+ ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // attn_ln_1_w
1127
+ ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_1_b
1128
1128
  //
1129
- ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_w
1130
- ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_b
1129
+ ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_ln_0_w
1130
+ ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_ln_0_b
1131
1131
 
1132
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_q_w
1133
- ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_q_b
1132
+ ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // cross_attn_q_w
1133
+ ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_q_b
1134
1134
 
1135
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_k_w
1135
+ ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // cross_attn_k_w
1136
1136
 
1137
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_v_w
1138
- ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_v_b
1137
+ ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // cross_attn_v_w
1138
+ ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_v_b
1139
1139
 
1140
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_ln_1_w
1141
- ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
1140
+ ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // cross_attn_ln_1_w
1141
+ ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_ln_1_b
1142
1142
  }
1143
1143
 
1144
1144
  ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead
@@ -1148,15 +1148,15 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1148
1148
 
1149
1149
  // create the ggml context
1150
1150
  {
1151
- struct ggml_init_params params = {
1151
+ struct wsp_ggml_init_params params = {
1152
1152
  /*.mem_size =*/ wctx.model.buf->size(),
1153
1153
  /*.mem_buffer =*/ wctx.model.buf->data(),
1154
1154
  /*.no_alloc =*/ false,
1155
1155
  };
1156
1156
 
1157
- model.ctx = ggml_init(params);
1157
+ model.ctx = wsp_ggml_init(params);
1158
1158
  if (!model.ctx) {
1159
- log("%s: ggml_init() failed\n", __func__);
1159
+ log("%s: wsp_ggml_init() failed\n", __func__);
1160
1160
  return false;
1161
1161
  }
1162
1162
  }
@@ -1184,16 +1184,16 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1184
1184
 
1185
1185
  // encoder
1186
1186
  {
1187
- model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx);
1187
+ model.e_pe = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, n_audio_state, n_audio_ctx);
1188
1188
 
1189
- model.e_conv_1_w = ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state);
1190
- model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
1189
+ model.e_conv_1_w = wsp_ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state);
1190
+ model.e_conv_1_b = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, 1, n_audio_state);
1191
1191
 
1192
- model.e_conv_2_w = ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state);
1193
- model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
1192
+ model.e_conv_2_w = wsp_ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state);
1193
+ model.e_conv_2_b = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, 1, n_audio_state);
1194
1194
 
1195
- model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1196
- model.e_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1195
+ model.e_ln_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);
1196
+ model.e_ln_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);
1197
1197
 
1198
1198
  // map by name
1199
1199
  model.tensors["encoder.positional_embedding"] = model.e_pe;
@@ -1210,28 +1210,28 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1210
1210
  for (int i = 0; i < n_audio_layer; ++i) {
1211
1211
  auto & layer = model.layers_encoder[i];
1212
1212
 
1213
- layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1214
- layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1213
+ layer.mlp_ln_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);
1214
+ layer.mlp_ln_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);
1215
1215
 
1216
- layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state);
1217
- layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state);
1216
+ layer.mlp_0_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state);
1217
+ layer.mlp_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 4*n_audio_state);
1218
1218
 
1219
- layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state);
1220
- layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1219
+ layer.mlp_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state);
1220
+ layer.mlp_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);
1221
1221
 
1222
- layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1223
- layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1222
+ layer.attn_ln_0_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);
1223
+ layer.attn_ln_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);
1224
1224
 
1225
- layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1226
- layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1225
+ layer.attn_q_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1226
+ layer.attn_q_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);
1227
1227
 
1228
- layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1228
+ layer.attn_k_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1229
1229
 
1230
- layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1231
- layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1230
+ layer.attn_v_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1231
+ layer.attn_v_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);
1232
1232
 
1233
- layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1234
- layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1233
+ layer.attn_ln_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1234
+ layer.attn_ln_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);
1235
1235
 
1236
1236
  // map by name
1237
1237
  model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
@@ -1261,12 +1261,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1261
1261
 
1262
1262
  // decoder
1263
1263
  {
1264
- model.d_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx);
1264
+ model.d_pe = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, n_text_state, n_text_ctx);
1265
1265
 
1266
- model.d_te = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab);
1266
+ model.d_te = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab);
1267
1267
 
1268
- model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1269
- model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1268
+ model.d_ln_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
1269
+ model.d_ln_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
1270
1270
 
1271
1271
  // map by name
1272
1272
  model.tensors["decoder.positional_embedding"] = model.d_pe;
@@ -1279,42 +1279,42 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1279
1279
  for (int i = 0; i < n_text_layer; ++i) {
1280
1280
  auto & layer = model.layers_decoder[i];
1281
1281
 
1282
- layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1283
- layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1282
+ layer.mlp_ln_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
1283
+ layer.mlp_ln_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
1284
1284
 
1285
- layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state);
1286
- layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state);
1285
+ layer.mlp_0_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state);
1286
+ layer.mlp_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 4*n_text_state);
1287
1287
 
1288
- layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state);
1289
- layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1288
+ layer.mlp_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state);
1289
+ layer.mlp_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
1290
1290
 
1291
- layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1292
- layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1291
+ layer.attn_ln_0_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
1292
+ layer.attn_ln_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
1293
1293
 
1294
- layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1295
- layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1294
+ layer.attn_q_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1295
+ layer.attn_q_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
1296
1296
 
1297
- layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1297
+ layer.attn_k_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1298
1298
 
1299
- layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1300
- layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1299
+ layer.attn_v_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1300
+ layer.attn_v_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
1301
1301
 
1302
- layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1303
- layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1302
+ layer.attn_ln_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1303
+ layer.attn_ln_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
1304
1304
 
1305
- layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1306
- layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1305
+ layer.cross_attn_ln_0_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
1306
+ layer.cross_attn_ln_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
1307
1307
 
1308
- layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1309
- layer.cross_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1308
+ layer.cross_attn_q_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1309
+ layer.cross_attn_q_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
1310
1310
 
1311
- layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1311
+ layer.cross_attn_k_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1312
1312
 
1313
- layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1314
- layer.cross_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1313
+ layer.cross_attn_v_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1314
+ layer.cross_attn_v_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
1315
1315
 
1316
- layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1317
- layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1316
+ layer.cross_attn_ln_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1317
+ layer.cross_attn_ln_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
1318
1318
 
1319
1319
  // map by name
1320
1320
  model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
@@ -1394,7 +1394,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1394
1394
  }
1395
1395
 
1396
1396
  auto tensor = model.tensors[name.data()];
1397
- if (ggml_nelements(tensor) != nelements) {
1397
+ if (wsp_ggml_nelements(tensor) != nelements) {
1398
1398
  log("%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
1399
1399
  log("%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n",
1400
1400
  __func__, ne[0], ne[1], ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]);
@@ -1407,19 +1407,19 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1407
1407
  return false;
1408
1408
  }
1409
1409
 
1410
- const size_t bpe = ggml_type_size(ggml_type(ttype));
1410
+ const size_t bpe = wsp_ggml_type_size(wsp_ggml_type(ttype));
1411
1411
 
1412
- if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
1412
+ if ((nelements*bpe)/wsp_ggml_blck_size(tensor->type) != wsp_ggml_nbytes(tensor)) {
1413
1413
  log("%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
1414
- __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
1414
+ __func__, name.data(), wsp_ggml_nbytes(tensor), nelements*bpe);
1415
1415
  return false;
1416
1416
  }
1417
1417
 
1418
- loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
1418
+ loader->read(loader->context, tensor->data, wsp_ggml_nbytes(tensor));
1419
1419
  BYTESWAP_TENSOR(tensor);
1420
1420
 
1421
- //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1024.0/1024.0);
1422
- total_size += ggml_nbytes(tensor);
1421
+ //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], wsp_ggml_type_name((wsp_ggml_type) ttype), wsp_ggml_nbytes(tensor)/1024.0/1024.0);
1422
+ total_size += wsp_ggml_nbytes(tensor);
1423
1423
  model.n_loaded++;
1424
1424
  }
1425
1425
 
@@ -1433,7 +1433,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1433
1433
  }
1434
1434
  }
1435
1435
 
1436
- wctx.t_load_us = ggml_time_us() - t_start_us;
1436
+ wctx.t_load_us = wsp_ggml_time_us() - t_start_us;
1437
1437
 
1438
1438
  return true;
1439
1439
  }
@@ -1454,7 +1454,7 @@ static bool whisper_encode_internal(
1454
1454
  const int mel_offset,
1455
1455
  const int n_threads){
1456
1456
 
1457
- const int64_t t_start_us = ggml_time_us();
1457
+ const int64_t t_start_us = wsp_ggml_time_us();
1458
1458
 
1459
1459
  const auto & model = wctx.model;
1460
1460
  const auto & mel_inp = wstate.mel;
@@ -1468,21 +1468,21 @@ static bool whisper_encode_internal(
1468
1468
  const int n_mels = hparams.n_mels;
1469
1469
  assert(mel_inp.n_mel == n_mels);
1470
1470
 
1471
- struct ggml_init_params params = {
1471
+ struct wsp_ggml_init_params params = {
1472
1472
  /*.mem_size =*/ wstate.buf_compute.size(),
1473
1473
  /*.mem_buffer =*/ wstate.buf_compute.data(),
1474
1474
  /*.no_alloc =*/ false,
1475
1475
  };
1476
1476
 
1477
- struct ggml_context * ctx0 = ggml_init(params);
1477
+ struct wsp_ggml_context * ctx0 = wsp_ggml_init(params);
1478
1478
 
1479
1479
  wstate.use_buf(ctx0, 0);
1480
1480
 
1481
- struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
1482
- assert(mel->type == GGML_TYPE_F32);
1481
+ struct wsp_ggml_tensor * mel = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, 2*n_ctx, n_mels);
1482
+ assert(mel->type == WSP_GGML_TYPE_F32);
1483
1483
  {
1484
1484
  float * dst = (float *) mel->data;
1485
- memset(dst, 0, ggml_nbytes(mel));
1485
+ memset(dst, 0, wsp_ggml_nbytes(mel));
1486
1486
 
1487
1487
  const int i0 = std::min(mel_offset, mel_inp.n_len);
1488
1488
  const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
@@ -1494,7 +1494,7 @@ static bool whisper_encode_internal(
1494
1494
  }
1495
1495
  }
1496
1496
 
1497
- struct ggml_tensor * cur;
1497
+ struct wsp_ggml_tensor * cur;
1498
1498
 
1499
1499
  #ifndef WHISPER_USE_COREML
1500
1500
  const bool use_coreml = false;
@@ -1513,25 +1513,25 @@ static bool whisper_encode_internal(
1513
1513
  {
1514
1514
  wstate.use_buf(ctx0, 1);
1515
1515
 
1516
- cur = ggml_conv_1d_ph(ctx0, model.e_conv_1_w, mel, 1, 1);
1517
- cur = ggml_add(ctx0,
1518
- ggml_repeat(ctx0,
1516
+ cur = wsp_ggml_conv_1d_ph(ctx0, model.e_conv_1_w, mel, 1, 1);
1517
+ cur = wsp_ggml_add(ctx0,
1518
+ wsp_ggml_repeat(ctx0,
1519
1519
  model.e_conv_1_b,
1520
1520
  cur),
1521
1521
  cur);
1522
1522
 
1523
- cur = ggml_gelu(ctx0, cur);
1523
+ cur = wsp_ggml_gelu(ctx0, cur);
1524
1524
 
1525
1525
  wstate.use_buf(ctx0, 0);
1526
1526
 
1527
- cur = ggml_conv_1d_ph(ctx0, model.e_conv_2_w, cur, 2, 1);
1528
- cur = ggml_add(ctx0,
1529
- ggml_repeat(ctx0,
1527
+ cur = wsp_ggml_conv_1d_ph(ctx0, model.e_conv_2_w, cur, 2, 1);
1528
+ cur = wsp_ggml_add(ctx0,
1529
+ wsp_ggml_repeat(ctx0,
1530
1530
  model.e_conv_2_b,
1531
1531
  cur),
1532
1532
  cur);
1533
1533
 
1534
- cur = ggml_gelu(ctx0, cur);
1534
+ cur = wsp_ggml_gelu(ctx0, cur);
1535
1535
  }
1536
1536
 
1537
1537
  wstate.use_buf(ctx0, 3);
@@ -1544,25 +1544,25 @@ static bool whisper_encode_internal(
1544
1544
  //iter = (iter + 1) % n_iter;
1545
1545
 
1546
1546
  //if (iter == 0) {
1547
- // memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k));
1548
- // memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v));
1547
+ // memset(model.memory_cross_k->data, 0, wsp_ggml_nbytes(model.memory_cross_k));
1548
+ // memset(model.memory_cross_v->data, 0, wsp_ggml_nbytes(model.memory_cross_v));
1549
1549
  //}
1550
1550
 
1551
1551
  static int iter = 0;
1552
1552
 
1553
- const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
1554
- const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
1553
+ const size_t e_pe_stride = model.e_pe->ne[0]*wsp_ggml_element_size(model.e_pe);
1554
+ const size_t e_pe_offset = model.e_pe->ne[0]*wsp_ggml_element_size(model.e_pe)*n_ctx*iter;
1555
1555
 
1556
- struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
1556
+ struct wsp_ggml_tensor * e_pe = wsp_ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
1557
1557
 
1558
- cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
1558
+ cur = wsp_ggml_add(ctx0, e_pe, wsp_ggml_transpose(ctx0, cur));
1559
1559
 
1560
1560
  // ===================================================================
1561
1561
 
1562
1562
  // original:
1563
- //cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
1563
+ //cur = wsp_ggml_add(ctx0, model.e_pe, wsp_ggml_transpose(ctx0, cur));
1564
1564
 
1565
- struct ggml_tensor * inpL = cur;
1565
+ struct wsp_ggml_tensor * inpL = cur;
1566
1566
 
1567
1567
  for (int il = 0; il < n_layer; ++il) {
1568
1568
  const auto & layer = model.layers_encoder[il];
@@ -1571,45 +1571,45 @@ static bool whisper_encode_internal(
1571
1571
  {
1572
1572
  wstate.use_buf(ctx0, 0);
1573
1573
 
1574
- cur = ggml_norm(ctx0, inpL);
1574
+ cur = wsp_ggml_norm(ctx0, inpL);
1575
1575
 
1576
1576
  // cur = ln_0_w*cur + ln_0_b
1577
- cur = ggml_add(ctx0,
1578
- ggml_mul(ctx0,
1579
- ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
1577
+ cur = wsp_ggml_add(ctx0,
1578
+ wsp_ggml_mul(ctx0,
1579
+ wsp_ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
1580
1580
  cur),
1581
- ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
1581
+ wsp_ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
1582
1582
  }
1583
1583
 
1584
1584
  // self-attention
1585
1585
  {
1586
1586
  wstate.use_buf(ctx0, 1);
1587
1587
 
1588
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
1588
+ struct wsp_ggml_tensor * Qcur = wsp_ggml_mul_mat(ctx0,
1589
1589
  layer.attn_q_w,
1590
1590
  cur);
1591
1591
 
1592
- Qcur = ggml_add(ctx0,
1593
- ggml_repeat(ctx0,
1592
+ Qcur = wsp_ggml_add(ctx0,
1593
+ wsp_ggml_repeat(ctx0,
1594
1594
  layer.attn_q_b,
1595
1595
  Qcur),
1596
1596
  Qcur);
1597
1597
 
1598
- //Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1598
+ //Qcur = wsp_ggml_scale_inplace(ctx0, Qcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1599
1599
 
1600
1600
  // note: no bias for Key
1601
- struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
1601
+ struct wsp_ggml_tensor * Kcur = wsp_ggml_mul_mat(ctx0,
1602
1602
  layer.attn_k_w,
1603
1603
  cur);
1604
1604
 
1605
- //Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1605
+ //Kcur = wsp_ggml_scale_inplace(ctx0, Kcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1606
1606
 
1607
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
1607
+ struct wsp_ggml_tensor * Vcur = wsp_ggml_mul_mat(ctx0,
1608
1608
  layer.attn_v_w,
1609
1609
  cur);
1610
1610
 
1611
- Vcur = ggml_add(ctx0,
1612
- ggml_repeat(ctx0,
1611
+ Vcur = wsp_ggml_add(ctx0,
1612
+ wsp_ggml_repeat(ctx0,
1613
1613
  layer.attn_v_b,
1614
1614
  Vcur),
1615
1615
  Vcur);
@@ -1619,98 +1619,98 @@ static bool whisper_encode_internal(
1619
1619
  wstate.use_buf(ctx0, 0);
1620
1620
 
1621
1621
  #ifdef WHISPER_USE_FLASH_ATTN
1622
- struct ggml_tensor * Q =
1623
- ggml_permute(ctx0,
1624
- ggml_cpy(ctx0,
1622
+ struct wsp_ggml_tensor * Q =
1623
+ wsp_ggml_permute(ctx0,
1624
+ wsp_ggml_cpy(ctx0,
1625
1625
  Qcur,
1626
- ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
1626
+ wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
1627
1627
  0, 2, 1, 3);
1628
1628
 
1629
- struct ggml_tensor * K =
1630
- ggml_permute(ctx0,
1631
- ggml_cpy(ctx0,
1629
+ struct wsp_ggml_tensor * K =
1630
+ wsp_ggml_permute(ctx0,
1631
+ wsp_ggml_cpy(ctx0,
1632
1632
  Kcur,
1633
- ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
1633
+ wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
1634
1634
  0, 2, 1, 3);
1635
1635
 
1636
- struct ggml_tensor * V =
1637
- ggml_cpy(ctx0,
1638
- ggml_permute(ctx0,
1639
- ggml_reshape_3d(ctx0,
1636
+ struct wsp_ggml_tensor * V =
1637
+ wsp_ggml_cpy(ctx0,
1638
+ wsp_ggml_permute(ctx0,
1639
+ wsp_ggml_reshape_3d(ctx0,
1640
1640
  Vcur,
1641
1641
  n_state/n_head, n_head, n_ctx),
1642
1642
  1, 2, 0, 3),
1643
- ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head));
1643
+ wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head));
1644
1644
 
1645
- struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
1645
+ struct wsp_ggml_tensor * KQV = wsp_ggml_flash_attn(ctx0, Q, K, V, false);
1646
1646
  #else
1647
- struct ggml_tensor * Q =
1648
- ggml_permute(ctx0,
1649
- ggml_cpy(ctx0,
1647
+ struct wsp_ggml_tensor * Q =
1648
+ wsp_ggml_permute(ctx0,
1649
+ wsp_ggml_cpy(ctx0,
1650
1650
  Qcur,
1651
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
1651
+ wsp_ggml_new_tensor_3d(ctx0, WSP_GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
1652
1652
  0, 2, 1, 3);
1653
1653
 
1654
- struct ggml_tensor * K =
1655
- ggml_permute(ctx0,
1656
- ggml_cpy(ctx0,
1654
+ struct wsp_ggml_tensor * K =
1655
+ wsp_ggml_permute(ctx0,
1656
+ wsp_ggml_cpy(ctx0,
1657
1657
  Kcur,
1658
- ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
1658
+ wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
1659
1659
  0, 2, 1, 3);
1660
1660
 
1661
1661
  // K * Q
1662
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
1662
+ struct wsp_ggml_tensor * KQ = wsp_ggml_mul_mat(ctx0, K, Q);
1663
1663
 
1664
- struct ggml_tensor * KQ_scaled =
1665
- ggml_scale_inplace(ctx0,
1664
+ struct wsp_ggml_tensor * KQ_scaled =
1665
+ wsp_ggml_scale_inplace(ctx0,
1666
1666
  KQ,
1667
- ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
1667
+ wsp_ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
1668
1668
  );
1669
1669
 
1670
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
1670
+ struct wsp_ggml_tensor * KQ_soft_max = wsp_ggml_soft_max_inplace(ctx0, KQ_scaled);
1671
1671
 
1672
- struct ggml_tensor * V =
1673
- ggml_cpy(ctx0,
1674
- ggml_permute(ctx0,
1675
- ggml_reshape_3d(ctx0,
1672
+ struct wsp_ggml_tensor * V =
1673
+ wsp_ggml_cpy(ctx0,
1674
+ wsp_ggml_permute(ctx0,
1675
+ wsp_ggml_reshape_3d(ctx0,
1676
1676
  Vcur,
1677
1677
  n_state/n_head, n_head, n_ctx),
1678
1678
  1, 2, 0, 3),
1679
- ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)
1679
+ wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)
1680
1680
  );
1681
1681
 
1682
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
1682
+ struct wsp_ggml_tensor * KQV = wsp_ggml_mul_mat(ctx0, V, KQ_soft_max);
1683
1683
  #endif
1684
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
1684
+ struct wsp_ggml_tensor * KQV_merged = wsp_ggml_permute(ctx0, KQV, 0, 2, 1, 3);
1685
1685
 
1686
1686
  wstate.use_buf(ctx0, 1);
1687
1687
 
1688
- cur = ggml_cpy(ctx0,
1688
+ cur = wsp_ggml_cpy(ctx0,
1689
1689
  KQV_merged,
1690
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx));
1690
+ wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx));
1691
1691
  }
1692
1692
 
1693
1693
  // projection
1694
1694
  {
1695
1695
  wstate.use_buf(ctx0, 0);
1696
1696
 
1697
- cur = ggml_mul_mat(ctx0,
1697
+ cur = wsp_ggml_mul_mat(ctx0,
1698
1698
  layer.attn_ln_1_w,
1699
1699
  cur);
1700
1700
 
1701
1701
  wstate.use_buf(ctx0, 1);
1702
1702
 
1703
- cur = ggml_add(ctx0,
1704
- ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
1703
+ cur = wsp_ggml_add(ctx0,
1704
+ wsp_ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
1705
1705
  cur);
1706
1706
  }
1707
1707
 
1708
1708
  wstate.use_buf(ctx0, 2);
1709
1709
 
1710
1710
  // add the input
1711
- cur = ggml_add(ctx0, cur, inpL);
1711
+ cur = wsp_ggml_add(ctx0, cur, inpL);
1712
1712
 
1713
- struct ggml_tensor * inpFF = cur;
1713
+ struct wsp_ggml_tensor * inpFF = cur;
1714
1714
 
1715
1715
  // feed-forward network
1716
1716
  {
@@ -1718,61 +1718,61 @@ static bool whisper_encode_internal(
1718
1718
  {
1719
1719
  wstate.use_buf(ctx0, 0);
1720
1720
 
1721
- cur = ggml_norm(ctx0, inpFF);
1721
+ cur = wsp_ggml_norm(ctx0, inpFF);
1722
1722
 
1723
1723
  wstate.use_buf(ctx0, 1);
1724
1724
 
1725
1725
  // cur = mlp_ln_w*cur + mlp_ln_b
1726
- cur = ggml_add(ctx0,
1727
- ggml_mul(ctx0,
1728
- ggml_repeat(ctx0, layer.mlp_ln_w, cur),
1726
+ cur = wsp_ggml_add(ctx0,
1727
+ wsp_ggml_mul(ctx0,
1728
+ wsp_ggml_repeat(ctx0, layer.mlp_ln_w, cur),
1729
1729
  cur),
1730
- ggml_repeat(ctx0, layer.mlp_ln_b, cur));
1730
+ wsp_ggml_repeat(ctx0, layer.mlp_ln_b, cur));
1731
1731
  }
1732
1732
 
1733
1733
  #ifdef WHISPER_USE_FLASH_FF
1734
1734
  wstate.use_buf(ctx0, 0);
1735
1735
 
1736
- cur = ggml_flash_ff(ctx0,
1737
- ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.itype, n_state, n_ctx)),
1736
+ cur = wsp_ggml_flash_ff(ctx0,
1737
+ wsp_ggml_cpy(ctx0, cur, wsp_ggml_new_tensor_2d(ctx0, wstate.itype, n_state, n_ctx)),
1738
1738
  layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
1739
1739
  #else
1740
1740
  wstate.use_buf(ctx0, 0);
1741
1741
 
1742
1742
  // fully connected
1743
- cur = ggml_mul_mat(ctx0,
1743
+ cur = wsp_ggml_mul_mat(ctx0,
1744
1744
  layer.mlp_0_w,
1745
1745
  cur);
1746
1746
 
1747
1747
  wstate.use_buf(ctx0, 1);
1748
1748
 
1749
- cur = ggml_add(ctx0,
1750
- ggml_repeat(ctx0, layer.mlp_0_b, cur),
1749
+ cur = wsp_ggml_add(ctx0,
1750
+ wsp_ggml_repeat(ctx0, layer.mlp_0_b, cur),
1751
1751
  cur);
1752
1752
 
1753
1753
  wstate.use_buf(ctx0, 0);
1754
1754
 
1755
1755
  // GELU activation
1756
- cur = ggml_gelu(ctx0, cur);
1756
+ cur = wsp_ggml_gelu(ctx0, cur);
1757
1757
 
1758
1758
  wstate.use_buf(ctx0, 1);
1759
1759
 
1760
1760
  // projection
1761
- cur = ggml_mul_mat(ctx0,
1761
+ cur = wsp_ggml_mul_mat(ctx0,
1762
1762
  layer.mlp_1_w,
1763
1763
  cur);
1764
1764
 
1765
1765
  wstate.use_buf(ctx0, 0);
1766
1766
 
1767
- cur = ggml_add(ctx0,
1768
- ggml_repeat(ctx0, layer.mlp_1_b, cur),
1767
+ cur = wsp_ggml_add(ctx0,
1768
+ wsp_ggml_repeat(ctx0, layer.mlp_1_b, cur),
1769
1769
  cur);
1770
1770
  #endif
1771
1771
  }
1772
1772
 
1773
1773
  wstate.use_buf(ctx0, 3);
1774
1774
 
1775
- inpL = ggml_add(ctx0, cur, inpFF);
1775
+ inpL = wsp_ggml_add(ctx0, cur, inpFF);
1776
1776
  }
1777
1777
 
1778
1778
  cur = inpL;
@@ -1781,36 +1781,36 @@ static bool whisper_encode_internal(
1781
1781
  {
1782
1782
  wstate.use_buf(ctx0, 0);
1783
1783
 
1784
- cur = ggml_norm(ctx0, cur);
1784
+ cur = wsp_ggml_norm(ctx0, cur);
1785
1785
 
1786
1786
  wstate.use_buf(ctx0, 1);
1787
1787
 
1788
1788
  // cur = ln_f_g*cur + ln_f_b
1789
- cur = ggml_add(ctx0,
1790
- ggml_mul(ctx0,
1791
- ggml_repeat(ctx0, model.e_ln_w, cur),
1789
+ cur = wsp_ggml_add(ctx0,
1790
+ wsp_ggml_mul(ctx0,
1791
+ wsp_ggml_repeat(ctx0, model.e_ln_w, cur),
1792
1792
  cur),
1793
- ggml_repeat(ctx0, model.e_ln_b, cur));
1793
+ wsp_ggml_repeat(ctx0, model.e_ln_b, cur));
1794
1794
  }
1795
1795
 
1796
1796
  wstate.use_buf(ctx0, -1);
1797
1797
 
1798
1798
  // run the computation
1799
1799
  {
1800
- struct ggml_cgraph gf = {};
1800
+ struct wsp_ggml_cgraph gf = {};
1801
1801
  gf.n_threads = n_threads;
1802
1802
 
1803
- ggml_build_forward_expand(&gf, cur);
1804
- ggml_graph_compute(ctx0, &gf);
1803
+ wsp_ggml_build_forward_expand(&gf, cur);
1804
+ wsp_ggml_graph_compute(ctx0, &gf);
1805
1805
 
1806
- //ggml_graph_print(&gf);
1806
+ //wsp_ggml_graph_print(&gf);
1807
1807
  }
1808
1808
  }
1809
1809
  #ifdef WHISPER_USE_COREML
1810
1810
  else if (use_coreml) {
1811
1811
  wstate.use_buf(ctx0, -1);
1812
1812
 
1813
- cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
1813
+ cur = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx);
1814
1814
 
1815
1815
  whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
1816
1816
  }
@@ -1819,7 +1819,7 @@ static bool whisper_encode_internal(
1819
1819
  else if (use_openvino) {
1820
1820
  wstate.use_buf(ctx0, -1);
1821
1821
 
1822
- cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
1822
+ cur = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx);
1823
1823
 
1824
1824
  if (!whisper_openvino_encode(wstate.ctx_openvino, mel, cur)) {
1825
1825
  return false;
@@ -1843,11 +1843,11 @@ static bool whisper_encode_internal(
1843
1843
 
1844
1844
  // pre-compute cross-attention memory
1845
1845
  {
1846
- struct ggml_cgraph gf = {};
1846
+ struct wsp_ggml_cgraph gf = {};
1847
1847
  gf.n_threads = n_threads;
1848
1848
 
1849
1849
  // TODO: hack to disconnect the encoded features from the previous graph
1850
- cur->op = GGML_OP_NONE;
1850
+ cur->op = WSP_GGML_OP_NONE;
1851
1851
  cur->src0 = nullptr;
1852
1852
  cur->src1 = nullptr;
1853
1853
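The three assignments just above are whisper.cpp's trick for reusing the encoder output in a fresh graph: clearing the op and the source pointers turns `cur` into a leaf, so the cross-attention graph treats its data as an input instead of re-running the encoder. As a standalone helper it would look roughly like this (illustrative only):

    // Make `t` a leaf so a new graph consumes its data without
    // recomputing the ops that produced it (same fields as above).
    static void detach_from_graph(struct wsp_ggml_tensor * t) {
        t->op   = WSP_GGML_OP_NONE;
        t->src0 = nullptr;
        t->src1 = nullptr;
    }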
 
@@ -1856,53 +1856,53 @@ static bool whisper_encode_internal(
1856
1856
 
1857
1857
  wstate.use_buf(ctx0, 0);
1858
1858
 
1859
- struct ggml_tensor* Kcross = ggml_mul_mat(ctx0,
1859
+ struct wsp_ggml_tensor* Kcross = wsp_ggml_mul_mat(ctx0,
1860
1860
  layer.cross_attn_k_w,
1861
1861
  cur);
1862
1862
 
1863
- Kcross = ggml_scale_inplace(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
1863
+ Kcross = wsp_ggml_scale_inplace(ctx0, Kcross, wsp_ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
1864
1864
 
1865
1865
  wstate.use_buf(ctx0, 1);
1866
1866
 
1867
- struct ggml_tensor* Vcross = ggml_mul_mat(ctx0,
1867
+ struct wsp_ggml_tensor* Vcross = wsp_ggml_mul_mat(ctx0,
1868
1868
  layer.cross_attn_v_w,
1869
1869
  cur);
1870
1870
 
1871
- Vcross = ggml_add(ctx0,
1872
- ggml_repeat(ctx0,
1871
+ Vcross = wsp_ggml_add(ctx0,
1872
+ wsp_ggml_repeat(ctx0,
1873
1873
  layer.cross_attn_v_b,
1874
1874
  Vcross),
1875
1875
  Vcross);
1876
1876
 
1877
1877
  wstate.use_buf(ctx0, -1);
1878
1878
 
1879
- Vcross = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcross, n_state, n_ctx));
1879
+ Vcross = wsp_ggml_transpose(ctx0, wsp_ggml_reshape_2d(ctx0, Vcross, n_state, n_ctx));
1880
1880
 
1881
- struct ggml_tensor * k = ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
1882
- struct ggml_tensor * v = ggml_view_2d(ctx0, wstate.kv_cross.v, n_ctx, n_state,
1883
- ( n_ctx)*ggml_element_size(wstate.kv_cross.v),
1884
- (il*n_ctx)*ggml_element_size(wstate.kv_cross.v)*n_state);
1881
+ struct wsp_ggml_tensor * k = wsp_ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (wsp_ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
1882
+ struct wsp_ggml_tensor * v = wsp_ggml_view_2d(ctx0, wstate.kv_cross.v, n_ctx, n_state,
1883
+ ( n_ctx)*wsp_ggml_element_size(wstate.kv_cross.v),
1884
+ (il*n_ctx)*wsp_ggml_element_size(wstate.kv_cross.v)*n_state);
1885
1885
 
1886
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcross, k));
1887
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v));
1886
+ wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Kcross, k));
1887
+ wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Vcross, v));
1888
1888
  }
1889
1889
 
1890
- ggml_graph_compute(ctx0, &gf);
1891
- //ggml_graph_print(&gf);
1890
+ wsp_ggml_graph_compute(ctx0, &gf);
1891
+ //wsp_ggml_graph_print(&gf);
1892
1892
  }
1893
1893
 
1894
1894
  ////////////////////////////////////////////////////////////////////////////
1895
1895
 
1896
1896
  //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
1897
- // ggml_used_mem(ctx0)/1024.0/1024.0,
1897
+ // wsp_ggml_used_mem(ctx0)/1024.0/1024.0,
1898
1898
  // wstate.get_buf_max_mem(0)/1024.0/1024.0,
1899
1899
  // wstate.get_buf_max_mem(1)/1024.0/1024.0,
1900
1900
  // wstate.get_buf_max_mem(2)/1024.0/1024.0,
1901
1901
  // wstate.get_buf_max_mem(3)/1024.0/1024.0);
1902
1902
 
1903
- ggml_free(ctx0);
1903
+ wsp_ggml_free(ctx0);
1904
1904
 
1905
- wstate.t_encode_us += ggml_time_us() - t_start_us;
1905
+ wstate.t_encode_us += wsp_ggml_time_us() - t_start_us;
1906
1906
  wstate.n_encode++;
1907
1907
 
1908
1908
  return true;
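The K/V writes in the encoder's cross-attention precompute treat kv_cross as one flat allocation holding every layer back-to-back, so the view offsets above are plain strided arithmetic. Pulled out as standalone helpers (illustrative only; esz is the cache element size, 2 bytes for an F16 cache):

    #include <cstddef>

    // Layer il's K slab: il earlier layers, n_ctx positions each,
    // n_state elements per position (the wsp_ggml_view_1d offset above).
    size_t kcross_off(size_t esz, size_t n_state, size_t n_ctx, size_t il) {
        return esz * n_state * il * n_ctx;
    }

    // V is stored transposed: n_state rows of n_ctx positions per layer,
    // row stride n_ctx*esz (the wsp_ggml_view_2d arguments above).
    size_t vcross_row_stride(size_t esz, size_t n_ctx) { return n_ctx * esz; }
    size_t vcross_off(size_t esz, size_t n_state, size_t n_ctx, size_t il) {
        return il * n_ctx * esz * n_state;
    }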
@@ -1926,7 +1926,7 @@ static bool whisper_decode_internal(
1926
1926
  const int n_tokens,
1927
1927
  const int n_past,
1928
1928
  const int n_threads) {
1929
- const int64_t t_start_us = ggml_time_us();
1929
+ const int64_t t_start_us = wsp_ggml_time_us();
1930
1930
 
1931
1931
  const auto & model = wctx.model;
1932
1932
  const auto & hparams = model.hparams;
@@ -1949,21 +1949,21 @@ static bool whisper_decode_internal(
1949
1949
 
1950
1950
  //WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx);
1951
1951
 
1952
- struct ggml_init_params params = {
1952
+ struct wsp_ggml_init_params params = {
1953
1953
  /*.mem_size =*/ wstate.buf_compute.size(),
1954
1954
  /*.mem_buffer =*/ wstate.buf_compute.data(),
1955
1955
  /*.no_alloc =*/ false,
1956
1956
  };
1957
1957
 
1958
- struct ggml_context * ctx0 = ggml_init(params);
1958
+ struct wsp_ggml_context * ctx0 = wsp_ggml_init(params);
1959
1959
 
1960
- struct ggml_cgraph gf = {};
1960
+ struct wsp_ggml_cgraph gf = {};
1961
1961
  gf.n_threads = n_threads;
1962
1962
 
1963
- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1964
- memcpy(embd->data, tokens, N*ggml_element_size(embd));
1963
+ struct wsp_ggml_tensor * embd = wsp_ggml_new_tensor_1d(ctx0, WSP_GGML_TYPE_I32, N);
1964
+ memcpy(embd->data, tokens, N*wsp_ggml_element_size(embd));
1965
1965
 
1966
- struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1966
+ struct wsp_ggml_tensor * position = wsp_ggml_new_tensor_1d(ctx0, WSP_GGML_TYPE_I32, N);
1967
1967
  for (int i = 0; i < N; ++i) {
1968
1968
  ((int32_t *) position->data)[i] = n_past + i;
1969
1969
  }
@@ -1971,12 +1971,12 @@ static bool whisper_decode_internal(
1971
1971
  wstate.use_buf(ctx0, 3);
1972
1972
 
1973
1973
  // token encoding + position encoding
1974
- struct ggml_tensor * cur =
1975
- ggml_add(ctx0,
1976
- ggml_get_rows(ctx0, model.d_te, embd),
1977
- ggml_get_rows(ctx0, model.d_pe, position));
1974
+ struct wsp_ggml_tensor * cur =
1975
+ wsp_ggml_add(ctx0,
1976
+ wsp_ggml_get_rows(ctx0, model.d_te, embd),
1977
+ wsp_ggml_get_rows(ctx0, model.d_pe, position));
1978
1978
 
1979
- struct ggml_tensor * inpL = cur;
1979
+ struct wsp_ggml_tensor * inpL = cur;
1980
1980
 
1981
1981
  for (int il = 0; il < n_layer; ++il) {
1982
1982
  const auto & layer = model.layers_decoder[il];
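The two get_rows calls plus the add above compute the decoder input: the token embedding plus the learned positional embedding at absolute position n_past + i. A plain reference loop over row-major float tables (illustrative only; the names are made up):

    #include <cstddef>

    // te: [n_vocab][n_state] token embeddings; pe: [n_text_ctx][n_state]
    // positional embeddings; out: [N][n_state].
    void embed_tokens(const float * te, const float * pe, const int * tokens,
                      int N, int n_past, int n_state, float * out) {
        for (int i = 0; i < N; ++i) {
            const float * t = te + (size_t) tokens[i]   * n_state;
            const float * p = pe + (size_t)(n_past + i) * n_state;
            for (int j = 0; j < n_state; ++j) {
                out[(size_t) i * n_state + j] = t[j] + p[j];
            }
        }
    }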
@@ -1985,236 +1985,236 @@ static bool whisper_decode_internal(
1985
1985
  {
1986
1986
  wstate.use_buf(ctx0, 0);
1987
1987
 
1988
- cur = ggml_norm(ctx0, inpL);
1988
+ cur = wsp_ggml_norm(ctx0, inpL);
1989
1989
 
1990
1990
  // cur = ln_0_w*cur + ln_0_b
1991
- cur = ggml_add(ctx0,
1992
- ggml_mul(ctx0,
1993
- ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
1991
+ cur = wsp_ggml_add(ctx0,
1992
+ wsp_ggml_mul(ctx0,
1993
+ wsp_ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
1994
1994
  cur),
1995
- ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
1995
+ wsp_ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
1996
1996
  }
1997
1997
 
1998
1998
  // self-attention
1999
1999
  {
2000
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
2000
+ struct wsp_ggml_tensor * Qcur = wsp_ggml_mul_mat(ctx0,
2001
2001
  layer.attn_q_w,
2002
2002
  cur);
2003
2003
 
2004
- Qcur = ggml_add(ctx0,
2005
- ggml_repeat(ctx0,
2004
+ Qcur = wsp_ggml_add(ctx0,
2005
+ wsp_ggml_repeat(ctx0,
2006
2006
  layer.attn_q_b,
2007
2007
  Qcur),
2008
2008
  Qcur);
2009
2009
 
2010
- Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
2010
+ Qcur = wsp_ggml_scale_inplace(ctx0, Qcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
2011
2011
 
2012
2012
  // note: no bias for Key
2013
- struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
2013
+ struct wsp_ggml_tensor * Kcur = wsp_ggml_mul_mat(ctx0,
2014
2014
  layer.attn_k_w,
2015
2015
  cur);
2016
2016
 
2017
- Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
2017
+ Kcur = wsp_ggml_scale_inplace(ctx0, Kcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
2018
2018
 
2019
2019
  // store key and value to memory
2020
2020
  {
2021
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
2021
+ struct wsp_ggml_tensor * Vcur = wsp_ggml_mul_mat(ctx0,
2022
2022
  layer.attn_v_w,
2023
2023
  cur);
2024
2024
 
2025
- Vcur = ggml_add(ctx0,
2026
- ggml_repeat(ctx0,
2025
+ Vcur = wsp_ggml_add(ctx0,
2026
+ wsp_ggml_repeat(ctx0,
2027
2027
  layer.attn_v_b,
2028
2028
  Vcur),
2029
2029
  Vcur);
2030
2030
 
2031
- Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_state, N));
2031
+ Vcur = wsp_ggml_transpose(ctx0, wsp_ggml_reshape_2d(ctx0, Vcur, n_state, N));
2032
2032
 
2033
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_state, (ggml_element_size(kv_self.k)*n_state)*(il*n_ctx + n_past));
2034
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_state,
2035
- ( n_ctx)*ggml_element_size(kv_self.v),
2036
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_state + n_past*ggml_element_size(kv_self.v));
2033
+ struct wsp_ggml_tensor * k = wsp_ggml_view_1d(ctx0, kv_self.k, N*n_state, (wsp_ggml_element_size(kv_self.k)*n_state)*(il*n_ctx + n_past));
2034
+ struct wsp_ggml_tensor * v = wsp_ggml_view_2d(ctx0, kv_self.v, N, n_state,
2035
+ ( n_ctx)*wsp_ggml_element_size(kv_self.v),
2036
+ (il*n_ctx)*wsp_ggml_element_size(kv_self.v)*n_state + n_past*wsp_ggml_element_size(kv_self.v));
2037
2037
 
2038
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
2039
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
2038
+ wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Kcur, k));
2039
+ wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Vcur, v));
2040
2040
  }
2041
2041
 
2042
2042
  // ------
2043
2043
 
2044
2044
  wstate.use_buf(ctx0, 0);
2045
2045
 
2046
- struct ggml_tensor * Q =
2047
- ggml_permute(ctx0,
2048
- ggml_cpy(ctx0,
2046
+ struct wsp_ggml_tensor * Q =
2047
+ wsp_ggml_permute(ctx0,
2048
+ wsp_ggml_cpy(ctx0,
2049
2049
  Qcur,
2050
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, N)),
2050
+ wsp_ggml_new_tensor_3d(ctx0, WSP_GGML_TYPE_F32, n_state/n_head, n_head, N)),
2051
2051
  0, 2, 1, 3);
2052
2052
 
2053
- struct ggml_tensor * K =
2054
- ggml_permute(ctx0,
2055
- ggml_reshape_3d(ctx0,
2056
- ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.k)*n_state),
2053
+ struct wsp_ggml_tensor * K =
2054
+ wsp_ggml_permute(ctx0,
2055
+ wsp_ggml_reshape_3d(ctx0,
2056
+ wsp_ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_state, il*n_ctx*wsp_ggml_element_size(kv_self.k)*n_state),
2057
2057
  n_state/n_head, n_head, n_past + N),
2058
2058
  0, 2, 1, 3);
2059
2059
 
2060
2060
  wstate.use_buf(ctx0, 1);
2061
2061
 
2062
2062
  // K * Q
2063
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
2063
+ struct wsp_ggml_tensor * KQ = wsp_ggml_mul_mat(ctx0, K, Q);
2064
2064
 
2065
- //struct ggml_tensor * KQ_scaled =
2066
- // ggml_scale_inplace(ctx0,
2065
+ //struct wsp_ggml_tensor * KQ_scaled =
2066
+ // wsp_ggml_scale_inplace(ctx0,
2067
2067
  // KQ,
2068
- // ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
2068
+ // wsp_ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
2069
2069
  // );
2070
2070
 
2071
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
2071
+ struct wsp_ggml_tensor * KQ_masked = wsp_ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
2072
2072
 
2073
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
2073
+ struct wsp_ggml_tensor * KQ_soft_max = wsp_ggml_soft_max_inplace(ctx0, KQ_masked);
2074
2074
 
2075
- struct ggml_tensor * V =
2076
- ggml_view_3d(ctx0, kv_self.v,
2075
+ struct wsp_ggml_tensor * V =
2076
+ wsp_ggml_view_3d(ctx0, kv_self.v,
2077
2077
  n_past + N, n_state/n_head, n_head,
2078
- n_ctx*ggml_element_size(kv_self.v),
2079
- n_ctx*ggml_element_size(kv_self.v)*n_state/n_head,
2080
- il*n_ctx*ggml_element_size(kv_self.v)*n_state);
2078
+ n_ctx*wsp_ggml_element_size(kv_self.v),
2079
+ n_ctx*wsp_ggml_element_size(kv_self.v)*n_state/n_head,
2080
+ il*n_ctx*wsp_ggml_element_size(kv_self.v)*n_state);
2081
2081
 
2082
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
2082
+ struct wsp_ggml_tensor * KQV = wsp_ggml_mul_mat(ctx0, V, KQ_soft_max);
2083
2083
 
2084
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
2084
+ struct wsp_ggml_tensor * KQV_merged = wsp_ggml_permute(ctx0, KQV, 0, 2, 1, 3);
2085
2085
 
2086
- cur = ggml_cpy(ctx0,
2086
+ cur = wsp_ggml_cpy(ctx0,
2087
2087
  KQV_merged,
2088
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, N));
2088
+ wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, N));
2089
2089
  }
2090
2090
 
2091
2091
  // projection
2092
2092
  {
2093
2093
  wstate.use_buf(ctx0, 0);
2094
2094
 
2095
- cur = ggml_mul_mat(ctx0,
2095
+ cur = wsp_ggml_mul_mat(ctx0,
2096
2096
  layer.attn_ln_1_w,
2097
2097
  cur);
2098
2098
 
2099
2099
  wstate.use_buf(ctx0, 1);
2100
2100
 
2101
- cur = ggml_add(ctx0,
2102
- ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
2101
+ cur = wsp_ggml_add(ctx0,
2102
+ wsp_ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
2103
2103
  cur);
2104
2104
  }
2105
2105
 
2106
2106
  wstate.use_buf(ctx0, 2);
2107
2107
 
2108
2108
  // add the input
2109
- struct ggml_tensor * inpCA = ggml_add(ctx0, cur, inpL);
2109
+ struct wsp_ggml_tensor * inpCA = wsp_ggml_add(ctx0, cur, inpL);
2110
2110
 
2111
2111
  // norm
2112
2112
  {
2113
2113
  wstate.use_buf(ctx0, 0);
2114
2114
 
2115
- cur = ggml_norm(ctx0, inpCA); // note: we use inpCA here
2115
+ cur = wsp_ggml_norm(ctx0, inpCA); // note: we use inpCA here
2116
2116
 
2117
2117
  // cur = ln_0_w*cur + ln_0_b
2118
- cur = ggml_add(ctx0,
2119
- ggml_mul(ctx0,
2120
- ggml_repeat(ctx0, layer.cross_attn_ln_0_w, cur),
2118
+ cur = wsp_ggml_add(ctx0,
2119
+ wsp_ggml_mul(ctx0,
2120
+ wsp_ggml_repeat(ctx0, layer.cross_attn_ln_0_w, cur),
2121
2121
  cur),
2122
- ggml_repeat(ctx0, layer.cross_attn_ln_0_b, cur));
2122
+ wsp_ggml_repeat(ctx0, layer.cross_attn_ln_0_b, cur));
2123
2123
  }
2124
2124
 
2125
2125
  // cross-attention
2126
2126
  {
2127
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
2127
+ struct wsp_ggml_tensor * Qcur = wsp_ggml_mul_mat(ctx0,
2128
2128
  layer.cross_attn_q_w,
2129
2129
  cur);
2130
2130
 
2131
- Qcur = ggml_add(ctx0,
2132
- ggml_repeat(ctx0,
2131
+ Qcur = wsp_ggml_add(ctx0,
2132
+ wsp_ggml_repeat(ctx0,
2133
2133
  layer.cross_attn_q_b,
2134
2134
  Qcur),
2135
2135
  Qcur);
2136
2136
 
2137
- Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
2137
+ Qcur = wsp_ggml_scale_inplace(ctx0, Qcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
2138
2138
 
2139
2139
  // Kcross is already scaled
2140
- struct ggml_tensor * Kcross =
2141
- ggml_reshape_3d(ctx0,
2142
- ggml_view_1d(ctx0, wstate.kv_cross.k, M*n_state, il*M*ggml_element_size(wstate.kv_cross.k)*n_state),
2140
+ struct wsp_ggml_tensor * Kcross =
2141
+ wsp_ggml_reshape_3d(ctx0,
2142
+ wsp_ggml_view_1d(ctx0, wstate.kv_cross.k, M*n_state, il*M*wsp_ggml_element_size(wstate.kv_cross.k)*n_state),
2143
2143
  n_state/n_head, n_head, M);
2144
2144
 
2145
- //struct ggml_tensor * Vcross =
2146
- // ggml_reshape_3d(ctx0,
2147
- // ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
2145
+ //struct wsp_ggml_tensor * Vcross =
2146
+ // wsp_ggml_reshape_3d(ctx0,
2147
+ // wsp_ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*wsp_ggml_element_size(wstate.kv_cross.v)*n_state),
2148
2148
  // n_state/n_head, n_head, M);
2149
2149
 
2150
- //struct ggml_tensor * V_trans =
2151
- // ggml_cpy(ctx0,
2152
- // ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
2153
- // ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
2150
+ //struct wsp_ggml_tensor * V_trans =
2151
+ // wsp_ggml_cpy(ctx0,
2152
+ // wsp_ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
2153
+ // wsp_ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
2154
2154
 
2155
- struct ggml_tensor * V =
2156
- ggml_view_3d(ctx0, wstate.kv_cross.v,
2155
+ struct wsp_ggml_tensor * V =
2156
+ wsp_ggml_view_3d(ctx0, wstate.kv_cross.v,
2157
2157
  M, n_state/n_head, n_head,
2158
- M*ggml_element_size(wstate.kv_cross.v),
2159
- M*ggml_element_size(wstate.kv_cross.v)*n_state/n_head,
2160
- il*M*ggml_element_size(wstate.kv_cross.v)*n_state);
2158
+ M*wsp_ggml_element_size(wstate.kv_cross.v),
2159
+ M*wsp_ggml_element_size(wstate.kv_cross.v)*n_state/n_head,
2160
+ il*M*wsp_ggml_element_size(wstate.kv_cross.v)*n_state);
2161
2161
 
2162
2162
  // ------
2163
2163
 
2164
- struct ggml_tensor * Q =
2165
- ggml_permute(ctx0,
2166
- ggml_cpy(ctx0,
2164
+ struct wsp_ggml_tensor * Q =
2165
+ wsp_ggml_permute(ctx0,
2166
+ wsp_ggml_cpy(ctx0,
2167
2167
  Qcur,
2168
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, N)),
2168
+ wsp_ggml_new_tensor_3d(ctx0, WSP_GGML_TYPE_F32, n_state/n_head, n_head, N)),
2169
2169
  0, 2, 1, 3);
2170
2170
 
2171
- struct ggml_tensor * K = ggml_permute(ctx0, Kcross, 0, 2, 1, 3);
2171
+ struct wsp_ggml_tensor * K = wsp_ggml_permute(ctx0, Kcross, 0, 2, 1, 3);
2172
2172
 
2173
2173
  // K * Q
2174
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
2174
+ struct wsp_ggml_tensor * KQ = wsp_ggml_mul_mat(ctx0, K, Q);
2175
2175
 
2176
- //struct ggml_tensor * KQ_scaled =
2177
- // ggml_scale_inplace(ctx0,
2176
+ //struct wsp_ggml_tensor * KQ_scaled =
2177
+ // wsp_ggml_scale_inplace(ctx0,
2178
2178
  // KQ,
2179
- // ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
2179
+ // wsp_ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
2180
2180
  // );
2181
2181
 
2182
2182
  // no masking for cross-attention
2183
- //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2183
+ //struct wsp_ggml_tensor * KQ_masked = wsp_ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2184
2184
 
2185
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);
2185
+ struct wsp_ggml_tensor * KQ_soft_max = wsp_ggml_soft_max_inplace(ctx0, KQ);
2186
2186
 
2187
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
2187
+ struct wsp_ggml_tensor * KQV = wsp_ggml_mul_mat(ctx0, V, KQ_soft_max);
2188
2188
 
2189
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
2189
+ struct wsp_ggml_tensor * KQV_merged = wsp_ggml_permute(ctx0, KQV, 0, 2, 1, 3);
2190
2190
 
2191
2191
  // cur = KQV_merged.contiguous().view(n_state, N)
2192
- cur = ggml_cpy(ctx0,
2192
+ cur = wsp_ggml_cpy(ctx0,
2193
2193
  KQV_merged,
2194
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, N));
2194
+ wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, N));
2195
2195
  }
2196
2196
 
2197
2197
  // projection
2198
2198
  {
2199
2199
  wstate.use_buf(ctx0, 0);
2200
2200
 
2201
- cur = ggml_mul_mat(ctx0,
2201
+ cur = wsp_ggml_mul_mat(ctx0,
2202
2202
  layer.cross_attn_ln_1_w,
2203
2203
  cur);
2204
2204
 
2205
2205
  wstate.use_buf(ctx0, 1);
2206
2206
 
2207
- cur = ggml_add(ctx0,
2208
- ggml_repeat(ctx0, layer.cross_attn_ln_1_b, cur),
2207
+ cur = wsp_ggml_add(ctx0,
2208
+ wsp_ggml_repeat(ctx0, layer.cross_attn_ln_1_b, cur),
2209
2209
  cur);
2210
2210
  }
2211
2211
 
2212
2212
  wstate.use_buf(ctx0, 2);
2213
2213
 
2214
2214
  // add the input
2215
- cur = ggml_add(ctx0, cur, inpCA);
2215
+ cur = wsp_ggml_add(ctx0, cur, inpCA);
2216
2216
 
2217
- struct ggml_tensor * inpFF = cur;
2217
+ struct wsp_ggml_tensor * inpFF = cur;
2218
2218
 
2219
2219
  // feed-forward network
2220
2220
  {
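In the self-attention and cross-attention above, Q and K are each pre-scaled by pow(n_state/n_head, -0.25), so their product already carries the usual attention temperature; that is why the KQ_scaled blocks stay commented out. With head dimension d = n_state/n_head:

    (d^{-1/4} K)\,(d^{-1/4} Q)^{\top} \;=\; \frac{K Q^{\top}}{\sqrt{d}}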
@@ -2222,53 +2222,53 @@ static bool whisper_decode_internal(
2222
2222
  {
2223
2223
  wstate.use_buf(ctx0, 0);
2224
2224
 
2225
- cur = ggml_norm(ctx0, inpFF);
2225
+ cur = wsp_ggml_norm(ctx0, inpFF);
2226
2226
 
2227
2227
  wstate.use_buf(ctx0, 1);
2228
2228
 
2229
2229
  // cur = mlp_ln_w*cur + mlp_ln_b
2230
- cur = ggml_add(ctx0,
2231
- ggml_mul(ctx0,
2232
- ggml_repeat(ctx0, layer.mlp_ln_w, cur),
2230
+ cur = wsp_ggml_add(ctx0,
2231
+ wsp_ggml_mul(ctx0,
2232
+ wsp_ggml_repeat(ctx0, layer.mlp_ln_w, cur),
2233
2233
  cur),
2234
- ggml_repeat(ctx0, layer.mlp_ln_b, cur));
2234
+ wsp_ggml_repeat(ctx0, layer.mlp_ln_b, cur));
2235
2235
  }
2236
2236
 
2237
2237
  wstate.use_buf(ctx0, 0);
2238
2238
 
2239
2239
  // fully connected
2240
- cur = ggml_mul_mat(ctx0,
2240
+ cur = wsp_ggml_mul_mat(ctx0,
2241
2241
  layer.mlp_0_w,
2242
2242
  cur);
2243
2243
 
2244
2244
  wstate.use_buf(ctx0, 1);
2245
2245
 
2246
- cur = ggml_add(ctx0,
2247
- ggml_repeat(ctx0, layer.mlp_0_b, cur),
2246
+ cur = wsp_ggml_add(ctx0,
2247
+ wsp_ggml_repeat(ctx0, layer.mlp_0_b, cur),
2248
2248
  cur);
2249
2249
 
2250
2250
  wstate.use_buf(ctx0, 0);
2251
2251
 
2252
2252
  // GELU activation
2253
- cur = ggml_gelu(ctx0, cur);
2253
+ cur = wsp_ggml_gelu(ctx0, cur);
2254
2254
 
2255
2255
  wstate.use_buf(ctx0, 1);
2256
2256
 
2257
2257
  // projection
2258
- cur = ggml_mul_mat(ctx0,
2258
+ cur = wsp_ggml_mul_mat(ctx0,
2259
2259
  layer.mlp_1_w,
2260
2260
  cur);
2261
2261
 
2262
2262
  wstate.use_buf(ctx0, 0);
2263
2263
 
2264
- cur = ggml_add(ctx0,
2265
- ggml_repeat(ctx0, layer.mlp_1_b, cur),
2264
+ cur = wsp_ggml_add(ctx0,
2265
+ wsp_ggml_repeat(ctx0, layer.mlp_1_b, cur),
2266
2266
  cur);
2267
2267
  }
2268
2268
 
2269
2269
  wstate.use_buf(ctx0, 3);
2270
2270
 
2271
- inpL = ggml_add(ctx0, cur, inpFF);
2271
+ inpL = wsp_ggml_add(ctx0, cur, inpFF);
2272
2272
  }
2273
2273
 
2274
2274
  cur = inpL;
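The feed-forward hunks above implement the standard transformer MLP, out = W2 * gelu(W1 * x + b1) + b2, with the biases broadcast via repeat. A scalar reference using the tanh GELU approximation (illustrative only; ggml's exact GELU variant may differ):

    #include <cmath>
    #include <cstddef>

    static float gelu_tanh(float x) {
        return 0.5f * x * (1.0f + tanhf(0.7978845608f * (x + 0.044715f * x * x * x)));
    }

    // x: [n], w1: [m][n], b1: [m], w2: [n][m], b2: [n], h: [m] scratch, out: [n].
    void mlp(const float * x, const float * w1, const float * b1,
             const float * w2, const float * b2, float * h, float * out,
             int n, int m) {
        for (int i = 0; i < m; ++i) {            // fully connected + GELU
            float s = b1[i];
            for (int j = 0; j < n; ++j) s += w1[(size_t) i * n + j] * x[j];
            h[i] = gelu_tanh(s);
        }
        for (int i = 0; i < n; ++i) {            // projection + bias
            float s = b2[i];
            for (int j = 0; j < m; ++j) s += w2[(size_t) i * m + j] * h[j];
            out[i] = s;
        }
    }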
@@ -2277,15 +2277,15 @@ static bool whisper_decode_internal(
2277
2277
  {
2278
2278
  wstate.use_buf(ctx0, 0);
2279
2279
 
2280
- cur = ggml_norm(ctx0, cur);
2280
+ cur = wsp_ggml_norm(ctx0, cur);
2281
2281
 
2282
2282
  wstate.use_buf(ctx0, 1);
2283
2283
 
2284
- cur = ggml_add(ctx0,
2285
- ggml_mul(ctx0,
2286
- ggml_repeat(ctx0, model.d_ln_w, cur),
2284
+ cur = wsp_ggml_add(ctx0,
2285
+ wsp_ggml_mul(ctx0,
2286
+ wsp_ggml_repeat(ctx0, model.d_ln_w, cur),
2287
2287
  cur),
2288
- ggml_repeat(ctx0, model.d_ln_b, cur));
2288
+ wsp_ggml_repeat(ctx0, model.d_ln_b, cur));
2289
2289
  }
2290
2290
 
2291
2291
  wstate.use_buf(ctx0, 0);
@@ -2293,38 +2293,38 @@ static bool whisper_decode_internal(
2293
2293
  // compute logits only for the last token
2294
2294
  // comment this line to compute logits for all N tokens
2295
2295
  // might be useful in the future
2296
- cur = ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], (cur->ne[1] - 1)*cur->nb[1]);
2296
+ cur = wsp_ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], (cur->ne[1] - 1)*cur->nb[1]);
2297
2297
 
2298
- struct ggml_tensor * logits = ggml_mul_mat(ctx0, model.d_te, cur);
2298
+ struct wsp_ggml_tensor * logits = wsp_ggml_mul_mat(ctx0, model.d_te, cur);
2299
2299
 
2300
2300
  wstate.use_buf(ctx0, -1);
2301
2301
 
2302
2302
  // run the computation
2303
2303
  {
2304
- ggml_build_forward_expand(&gf, logits);
2305
- ggml_graph_compute (ctx0, &gf);
2304
+ wsp_ggml_build_forward_expand(&gf, logits);
2305
+ wsp_ggml_graph_compute (ctx0, &gf);
2306
2306
  }
2307
2307
 
2308
2308
  // extract logits for all N tokens
2309
2309
  //logits_out.resize(N*n_vocab);
2310
- //memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*N*n_vocab);
2310
+ //memcpy(logits_out.data(), wsp_ggml_get_data(logits), sizeof(float)*N*n_vocab);
2311
2311
 
2312
2312
  // extract logits only for the last token
2313
2313
  logits_out.resize(n_vocab);
2314
- memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*n_vocab);
2314
+ memcpy(logits_out.data(), wsp_ggml_get_data(logits), sizeof(float)*n_vocab);
2315
2315
 
2316
2316
  if (N > 1) {
2317
2317
  //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
2318
- // ggml_used_mem(ctx0)/1024.0/1024.0,
2318
+ // wsp_ggml_used_mem(ctx0)/1024.0/1024.0,
2319
2319
  // wstate.get_buf_max_mem(0)/1024.0/1024.0,
2320
2320
  // wstate.get_buf_max_mem(1)/1024.0/1024.0,
2321
2321
  // wstate.get_buf_max_mem(2)/1024.0/1024.0,
2322
2322
  // wstate.get_buf_max_mem(3)/1024.0/1024.0);
2323
2323
  }
2324
2324
 
2325
- ggml_free(ctx0);
2325
+ wsp_ggml_free(ctx0);
2326
2326
 
2327
- wstate.t_decode_us += ggml_time_us() - t_start_us;
2327
+ wstate.t_decode_us += wsp_ggml_time_us() - t_start_us;
2328
2328
  wstate.n_decode++;
2329
2329
 
2330
2330
  return true;
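The view_2d before the vocabulary projection above trims the hidden states to their final row, so only one row of logits is computed per decode step; the offset (cur->ne[1] - 1)*cur->nb[1] is just the byte address of the last row. As a tiny helper (illustrative only):

    #include <cstddef>

    // Byte offset of the last of n_rows rows, row_stride bytes apart
    // (matches (cur->ne[1] - 1)*cur->nb[1] above).
    size_t last_row_offset(size_t n_rows, size_t row_stride) {
        return (n_rows - 1) * row_stride;
    }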
@@ -2502,7 +2502,7 @@ static bool log_mel_spectrogram(
2502
2502
  const whisper_filters & filters,
2503
2503
  const bool speed_up,
2504
2504
  whisper_mel & mel) {
2505
- const int64_t t_start_us = ggml_time_us();
2505
+ const int64_t t_start_us = wsp_ggml_time_us();
2506
2506
 
2507
2507
  // Hanning window
2508
2508
  std::vector<float> hann;
@@ -2574,7 +2574,7 @@ static bool log_mel_spectrogram(
2574
2574
  mel.data[i] = (mel.data[i] + 4.0)/4.0;
2575
2575
  }
2576
2576
 
2577
- wstate.t_mel_us += ggml_time_us() - t_start_us;
2577
+ wstate.t_mel_us += wsp_ggml_time_us() - t_start_us;
2578
2578
 
2579
2579
  //printf("mel.n_len() = %d, divided by 1500: %f, n_samples / fft_step: %d\n", mel.n_len, mel.n_len / 1500.0, n_samples / fft_step);
2580
2580
 
@@ -2705,7 +2705,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
2705
2705
  }
2706
2706
 
2707
2707
  {
2708
- const size_t memory_size = ggml_nbytes(state->decoders[0].kv_self.k) + ggml_nbytes(state->decoders[0].kv_self.v);
2708
+ const size_t memory_size = wsp_ggml_nbytes(state->decoders[0].kv_self.k) + wsp_ggml_nbytes(state->decoders[0].kv_self.v);
2709
2709
  log("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
2710
2710
  }
2711
2711
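memory_size above is simply the K footprint plus the V footprint of decoder 0's self-attention cache. For reference, the same number derived from the hyperparameters (illustrative only; assumes an F16 cache at 2 bytes per element, per decoder):

    #include <cstddef>

    size_t kv_self_bytes(size_t n_layer, size_t n_ctx, size_t n_state) {
        const size_t esz = 2;                      // F16 cache element size
        return 2 /* K and V */ * n_layer * n_ctx * n_state * esz;
    }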
 
@@ -2716,7 +2716,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
2716
2716
  }
2717
2717
 
2718
2718
  {
2719
- const size_t memory_size = ggml_nbytes(state->kv_cross.k) + ggml_nbytes(state->kv_cross.v);
2719
+ const size_t memory_size = wsp_ggml_nbytes(state->kv_cross.k) + wsp_ggml_nbytes(state->kv_cross.v);
2720
2720
  log("%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
2721
2721
  }
2722
2722
 
@@ -2885,7 +2885,7 @@ struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t
2885
2885
  }
2886
2886
 
2887
2887
  struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader) {
2888
- ggml_time_init();
2888
+ wsp_ggml_time_init();
2889
2889
 
2890
2890
  whisper_context * ctx = new whisper_context;
2891
2891
 
@@ -2976,7 +2976,7 @@ void whisper_free_state(struct whisper_state * state)
2976
2976
  void whisper_free(struct whisper_context * ctx) {
2977
2977
  if (ctx) {
2978
2978
  if (ctx->model.ctx) {
2979
- ggml_free(ctx->model.ctx);
2979
+ wsp_ggml_free(ctx->model.ctx);
2980
2980
  }
2981
2981
  if (ctx->model.buf) {
2982
2982
  delete ctx->model.buf;
@@ -3373,7 +3373,7 @@ whisper_token whisper_token_transcribe(struct whisper_context * ctx) {
3373
3373
  }
3374
3374
 
3375
3375
  void whisper_print_timings(struct whisper_context * ctx) {
3376
- const int64_t t_end_us = ggml_time_us();
3376
+ const int64_t t_end_us = wsp_ggml_time_us();
3377
3377
 
3378
3378
  log("\n");
3379
3379
  log("%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
@@ -3420,18 +3420,18 @@ const char * whisper_print_system_info(void) {
3420
3420
  static std::string s;
3421
3421
 
3422
3422
  s = "";
3423
- s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
3424
- s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
3425
- s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
3426
- s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
3427
- s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
3428
- s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
3429
- s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
3430
- s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
3431
- s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
3432
- s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
3433
- s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
3434
- s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
3423
+ s += "AVX = " + std::to_string(wsp_ggml_cpu_has_avx()) + " | ";
3424
+ s += "AVX2 = " + std::to_string(wsp_ggml_cpu_has_avx2()) + " | ";
3425
+ s += "AVX512 = " + std::to_string(wsp_ggml_cpu_has_avx512()) + " | ";
3426
+ s += "FMA = " + std::to_string(wsp_ggml_cpu_has_fma()) + " | ";
3427
+ s += "NEON = " + std::to_string(wsp_ggml_cpu_has_neon()) + " | ";
3428
+ s += "ARM_FMA = " + std::to_string(wsp_ggml_cpu_has_arm_fma()) + " | ";
3429
+ s += "F16C = " + std::to_string(wsp_ggml_cpu_has_f16c()) + " | ";
3430
+ s += "FP16_VA = " + std::to_string(wsp_ggml_cpu_has_fp16_va()) + " | ";
3431
+ s += "WASM_SIMD = " + std::to_string(wsp_ggml_cpu_has_wasm_simd()) + " | ";
3432
+ s += "BLAS = " + std::to_string(wsp_ggml_cpu_has_blas()) + " | ";
3433
+ s += "SSE3 = " + std::to_string(wsp_ggml_cpu_has_sse3()) + " | ";
3434
+ s += "VSX = " + std::to_string(wsp_ggml_cpu_has_vsx()) + " | ";
3435
3435
  s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
3436
3436
  s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
3437
3437
 
@@ -4314,7 +4314,7 @@ int whisper_full_with_state(
4314
4314
  }
4315
4315
 
4316
4316
  {
4317
- const int64_t t_start_sample_us = ggml_time_us();
4317
+ const int64_t t_start_sample_us = wsp_ggml_time_us();
4318
4318
 
4319
4319
  whisper_process_logits(*ctx, *state, params, state->decoders[0], t_cur);
4320
4320
 
@@ -4323,8 +4323,8 @@ int whisper_full_with_state(
4323
4323
  for (int j = 1; j < n_decoders_cur; ++j) {
4324
4324
  auto & decoder = state->decoders[j];
4325
4325
 
4326
- memcpy(decoder.kv_self.k->data, state->decoders[0].kv_self.k->data, ggml_nbytes(decoder.kv_self.k));
4327
- memcpy(decoder.kv_self.v->data, state->decoders[0].kv_self.v->data, ggml_nbytes(decoder.kv_self.v));
4326
+ memcpy(decoder.kv_self.k->data, state->decoders[0].kv_self.k->data, wsp_ggml_nbytes(decoder.kv_self.k));
4327
+ memcpy(decoder.kv_self.v->data, state->decoders[0].kv_self.v->data, wsp_ggml_nbytes(decoder.kv_self.v));
4328
4328
 
4329
4329
  decoder.kv_self.n += prompt.size();
4330
4330
 
@@ -4333,12 +4333,12 @@ int whisper_full_with_state(
4333
4333
  memcpy(decoder.logprobs.data(), state->decoders[0].logprobs.data(), decoder.logprobs.size()*sizeof(decoder.logprobs[0]));
4334
4334
  }
4335
4335
 
4336
- state->t_sample_us += ggml_time_us() - t_start_sample_us;
4336
+ state->t_sample_us += wsp_ggml_time_us() - t_start_sample_us;
4337
4337
  }
4338
4338
  }
4339
4339
 
4340
4340
  for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
4341
- const int64_t t_start_sample_us = ggml_time_us();
4341
+ const int64_t t_start_sample_us = wsp_ggml_time_us();
4342
4342
 
4343
4343
  // store the KV caches of all decoders when doing beam-search
4344
4344
  if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
@@ -4350,8 +4350,8 @@ int whisper_full_with_state(
4350
4350
  continue;
4351
4351
  }
4352
4352
 
4353
- kv_bufs[j].k.resize(ggml_nbytes(decoder.kv_self.k));
4354
- kv_bufs[j].v.resize(ggml_nbytes(decoder.kv_self.v));
4353
+ kv_bufs[j].k.resize(wsp_ggml_nbytes(decoder.kv_self.k));
4354
+ kv_bufs[j].v.resize(wsp_ggml_nbytes(decoder.kv_self.v));
4355
4355
 
4356
4356
  memcpy(kv_bufs[j].k.data(), decoder.kv_self.k->data, kv_bufs[j].k.size());
4357
4357
  memcpy(kv_bufs[j].v.data(), decoder.kv_self.v->data, kv_bufs[j].v.size());
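For beam search, each live decoder's K/V cache is snapshotted into plain byte buffers so a beam can later be restored onto a decoder; the resize/memcpy pair above is the save half. Both halves as a condensed sketch (illustrative only; the restore runs later in this same function):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    struct kv_buf { std::vector<uint8_t> k, v; };  // as in the hunk above

    void kv_save(kv_buf & buf, const wsp_ggml_tensor * k, const wsp_ggml_tensor * v) {
        buf.k.resize(wsp_ggml_nbytes(k));
        buf.v.resize(wsp_ggml_nbytes(v));
        memcpy(buf.k.data(), k->data, buf.k.size());
        memcpy(buf.v.data(), v->data, buf.v.size());
    }

    void kv_load(const kv_buf & buf, wsp_ggml_tensor * k, wsp_ggml_tensor * v) {
        memcpy(k->data, buf.k.data(), buf.k.size());
        memcpy(v->data, buf.v.data(), buf.v.size());
    }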
@@ -4531,7 +4531,7 @@ int whisper_full_with_state(
4531
4531
  }
4532
4532
  }
4533
4533
 
4534
- state->t_sample_us += ggml_time_us() - t_start_sample_us;
4534
+ state->t_sample_us += wsp_ggml_time_us() - t_start_sample_us;
4535
4535
 
4536
4536
  // obtain logits for the next token
4537
4537
  for (int j = 0; j < n_decoders_cur; ++j) {
@@ -4552,13 +4552,13 @@ int whisper_full_with_state(
4552
4552
  }
4553
4553
 
4554
4554
  {
4555
- const int64_t t_start_sample_us = ggml_time_us();
4555
+ const int64_t t_start_sample_us = wsp_ggml_time_us();
4556
4556
 
4557
4557
  whisper_process_logits(*ctx, *state, params, decoder, t_cur);
4558
4558
 
4559
4559
  ++decoder.kv_self.n;
4560
4560
 
4561
- state->t_sample_us += ggml_time_us() - t_start_sample_us;
4561
+ state->t_sample_us += wsp_ggml_time_us() - t_start_sample_us;
4562
4562
  }
4563
4563
  }
4564
4564
  }
@@ -4980,7 +4980,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
4980
4980
  s = "";
4981
4981
  char strbuf[256];
4982
4982
 
4983
- ggml_time_init();
4983
+ wsp_ggml_time_init();
4984
4984
 
4985
4985
  size_t n = 20;
4986
4986
  size_t arr = n_threads > 0 ? 1024llu : n_threads; // trick to avoid compiler optimizations
@@ -5001,11 +5001,11 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
5001
5001
  double sum = 0.0;
5002
5002
 
5003
5003
  for (size_t i = 0; i < n; i++) {
5004
- const int64_t t0 = ggml_time_us();
5004
+ const int64_t t0 = wsp_ggml_time_us();
5005
5005
 
5006
5006
  memcpy(dst, src, size);
5007
5007
 
5008
- const int64_t t1 = ggml_time_us();
5008
+ const int64_t t1 = wsp_ggml_time_us();
5009
5009
 
5010
5010
  tsum += (t1 - t0)*1e-6;
5011
5011
 
@@ -5030,17 +5030,17 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
5030
5030
  return s.c_str();
5031
5031
  }
5032
5032
 
5033
- WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
5034
- fputs(whisper_bench_ggml_mul_mat_str(n_threads), stderr);
5033
+ WHISPER_API int whisper_bench_wsp_ggml_mul_mat(int n_threads) {
5034
+ fputs(whisper_bench_wsp_ggml_mul_mat_str(n_threads), stderr);
5035
5035
  return 0;
5036
5036
  }
5037
5037
 
5038
- WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
5038
+ WHISPER_API const char * whisper_bench_wsp_ggml_mul_mat_str(int n_threads) {
5039
5039
  static std::string s;
5040
5040
  s = "";
5041
5041
  char strbuf[256];
5042
5042
 
5043
- ggml_time_init();
5043
+ wsp_ggml_time_init();
5044
5044
 
5045
5045
  const int n_max = 128;
5046
5046
 
@@ -5080,45 +5080,45 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
5080
5080
  const size_t N = sizes[j];
5081
5081
 
5082
5082
  for (int k = 0; k < 7; ++k) {
5083
- const ggml_type wtype =
5084
- k == 0 ? GGML_TYPE_Q4_0 :
5085
- k == 1 ? GGML_TYPE_Q4_1 :
5086
- k == 2 ? GGML_TYPE_Q5_0 :
5087
- k == 3 ? GGML_TYPE_Q5_1 :
5088
- k == 4 ? GGML_TYPE_Q8_0 :
5089
- k == 5 ? GGML_TYPE_F16 : GGML_TYPE_F32;
5083
+ const wsp_ggml_type wtype =
5084
+ k == 0 ? WSP_GGML_TYPE_Q4_0 :
5085
+ k == 1 ? WSP_GGML_TYPE_Q4_1 :
5086
+ k == 2 ? WSP_GGML_TYPE_Q5_0 :
5087
+ k == 3 ? WSP_GGML_TYPE_Q5_1 :
5088
+ k == 4 ? WSP_GGML_TYPE_Q8_0 :
5089
+ k == 5 ? WSP_GGML_TYPE_F16 : WSP_GGML_TYPE_F32;
5090
5090
 
5091
5091
  double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32;
5092
5092
  int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? n_fp16 : /*k == 6*/ n_fp32;
5093
5093
 
5094
- struct ggml_init_params gparams = {
5094
+ struct wsp_ggml_init_params gparams = {
5095
5095
  /*.mem_size =*/ buf.size(),
5096
5096
  /*.mem_buffer =*/ buf.data(),
5097
5097
  /*.no_alloc =*/ false,
5098
5098
  };
5099
5099
 
5100
- struct ggml_context * ctx0 = ggml_init(gparams);
5100
+ struct wsp_ggml_context * ctx0 = wsp_ggml_init(gparams);
5101
5101
 
5102
- struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N);
5103
- struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
5102
+ struct wsp_ggml_tensor * a = wsp_ggml_new_tensor_2d(ctx0, wtype, N, N);
5103
+ struct wsp_ggml_tensor * b = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, N, N);
5104
5104
 
5105
- struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
5105
+ struct wsp_ggml_tensor * c = wsp_ggml_mul_mat(ctx0, a, b);
5106
5106
 
5107
- struct ggml_cgraph gf = ggml_build_forward(c);
5107
+ struct wsp_ggml_cgraph gf = wsp_ggml_build_forward(c);
5108
5108
 
5109
5109
  gf.n_threads = n_threads;
5110
5110
 
5111
5111
  double tsum = 0.0;
5112
5112
 
5113
5113
  // heat-up
5114
- ggml_graph_compute(ctx0, &gf);
5114
+ wsp_ggml_graph_compute(ctx0, &gf);
5115
5115
 
5116
5116
  for (int i = 0; i < n_max; ++i) {
5117
- const int64_t t0 = ggml_time_us();
5117
+ const int64_t t0 = wsp_ggml_time_us();
5118
5118
 
5119
- ggml_graph_compute(ctx0, &gf);
5119
+ wsp_ggml_graph_compute(ctx0, &gf);
5120
5120
 
5121
- const int64_t t1 = ggml_time_us();
5121
+ const int64_t t1 = wsp_ggml_time_us();
5122
5122
 
5123
5123
  tsum += (t1 - t0)*1e-6;
5124
5124
  n++;
@@ -5128,7 +5128,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
5128
5128
  }
5129
5129
  }
5130
5130
 
5131
- ggml_free(ctx0);
5131
+ wsp_ggml_free(ctx0);
5132
5132
 
5133
5133
  s = ((2.0*N*N*N*n)/tsum)*1e-9;
5134
5134
  }
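The closing s = ((2.0*N*N*N*n)/tsum)*1e-9 converts the matmul benchmark into GFLOPS: one N x N by N x N multiplication costs about 2N^3 floating-point operations (N multiplies and roughly N adds per output element), repeated n times over tsum seconds:

    \text{GFLOPS} = \frac{2 N^3 \cdot n}{t_{\text{sum}}} \times 10^{-9}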