whisper.rn 0.3.0-rc.1 → 0.3.0-rc.3

This diff shows the changes between two published versions of the package, as they appear in their public registry. It is provided for informational purposes only.
package/cpp/ggml.h CHANGED
@@ -190,9 +190,12 @@
 #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1
 
+#define GGML_QNT_VERSION        2    // bump this on quantization format changes
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
 #define GGML_MAX_DIMS          4
 #define GGML_MAX_NODES         4096
-#define GGML_MAX_PARAMS        16
+#define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_OPT           4
 #define GGML_DEFAULT_N_THREADS 4
@@ -231,7 +234,7 @@ extern "C" {
         GGML_TYPE_F16  = 1,
         GGML_TYPE_Q4_0 = 2,
         GGML_TYPE_Q4_1 = 3,
-        GGML_TYPE_Q4_2 = 4,
+        // GGML_TYPE_Q4_2 = 4, support has been removed
         // GGML_TYPE_Q4_3 (5) support has been removed
         GGML_TYPE_Q5_0 = 6,
         GGML_TYPE_Q5_1 = 7,
@@ -243,6 +246,11 @@ extern "C" {
         GGML_TYPE_COUNT,
     };
 
+    enum ggml_backend {
+        GGML_BACKEND_CPU = 0,
+        GGML_BACKEND_CUDA = 1,
+    };
+
     // model file types
     enum ggml_ftype {
         GGML_FTYPE_UNKNOWN = -1,
@@ -251,7 +259,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GGML_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
@@ -263,12 +270,16 @@ extern "C" {
 
         GGML_OP_DUP,
         GGML_OP_ADD,
+        GGML_OP_ADD1,
+        GGML_OP_ACC,
         GGML_OP_SUB,
         GGML_OP_MUL,
         GGML_OP_DIV,
         GGML_OP_SQR,
         GGML_OP_SQRT,
+        GGML_OP_LOG,
         GGML_OP_SUM,
+        GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
         GGML_OP_REPEAT,
         GGML_OP_ABS,
@@ -278,12 +289,15 @@ extern "C" {
         GGML_OP_RELU,
         GGML_OP_GELU,
         GGML_OP_SILU,
+        GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
         GGML_OP_RMS_NORM,
+        GGML_OP_RMS_NORM_BACK,
 
         GGML_OP_MUL_MAT,
 
         GGML_OP_SCALE,
+        GGML_OP_SET,
         GGML_OP_CPY,
         GGML_OP_CONT,
         GGML_OP_RESHAPE,
@@ -291,10 +305,15 @@ extern "C" {
         GGML_OP_PERMUTE,
         GGML_OP_TRANSPOSE,
         GGML_OP_GET_ROWS,
+        GGML_OP_GET_ROWS_BACK,
+        GGML_OP_DIAG,
         GGML_OP_DIAG_MASK_INF,
+        GGML_OP_DIAG_MASK_ZERO,
         GGML_OP_SOFT_MAX,
         GGML_OP_ROPE,
+        GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
+        GGML_OP_CLAMP,
         GGML_OP_CONV_1D_1S,
         GGML_OP_CONV_1D_2S,
 
@@ -322,7 +341,8 @@ extern "C" {
 
     // n-dimensional tensor
     struct ggml_tensor {
-        enum ggml_type type;
+        enum ggml_type    type;
+        enum ggml_backend backend;
 
         int     n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -353,7 +373,7 @@ extern "C" {
 
         char name[32];
 
-        char padding[8]; // TODO: remove and add padding to name?
+        char padding[16];
     };
 
     // computation graph
@@ -497,6 +517,29 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    GGML_API struct ggml_tensor * ggml_add1(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_acc(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_acc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
     GGML_API struct ggml_tensor * ggml_sub(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -520,12 +563,24 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_log(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_log_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // return scalar
-    // TODO: compute sum along rows
     GGML_API struct ggml_tensor * ggml_sum(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
+    GGML_API struct ggml_tensor * ggml_sum_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // mean along rows
     GGML_API struct ggml_tensor * ggml_mean(
             struct ggml_context * ctx,
@@ -567,6 +622,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_silu_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     // normalize along rows
     // TODO: eps is hardcoded to 1e-5 for now
     GGML_API struct ggml_tensor * ggml_norm(
@@ -577,6 +639,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_rms_norm_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     // A: m rows, n columns
     // B: p rows, n columns (i.e. we transpose it internally)
     // result is m columns, p rows
@@ -589,12 +658,66 @@ extern "C" {
     // operations on tensors without backpropagation
     //
 
-    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_scale(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_scale_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                offset);
+
+
     // a -> b, return view(b)
     GGML_API struct ggml_tensor * ggml_cpy(
             struct ggml_context * ctx,
@@ -615,6 +738,11 @@ extern "C" {
 
     // return view(a)
     // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0);
+
     GGML_API struct ggml_tensor * ggml_reshape_2d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -630,6 +758,14 @@ extern "C" {
             int64_t               ne1,
             int64_t               ne2);
 
+    GGML_API struct ggml_tensor * ggml_reshape_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3);
+
     // offset in bytes
     GGML_API struct ggml_tensor * ggml_view_1d(
             struct ggml_context * ctx,
@@ -655,6 +791,18 @@ extern "C" {
             size_t                nb2, // slice stride in bytes
             size_t                offset);
 
+    GGML_API struct ggml_tensor * ggml_view_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3,
+            size_t                nb1, // row stride in bytes
+            size_t                nb2, // slice stride in bytes
+            size_t                nb3,
+            size_t                offset);
+
     GGML_API struct ggml_tensor * ggml_permute(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -673,20 +821,50 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    GGML_API struct ggml_tensor * ggml_get_rows_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c);
+
+    GGML_API struct ggml_tensor * ggml_diag(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // set elements above the diagonal to -INF
-    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_diag_mask_inf(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   n_past);
 
     // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    // set elements above the diagonal to 0
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
     GGML_API struct ggml_tensor * ggml_soft_max(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
-    // rotary position embedding
     // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // rotary position embedding
     // if mode & 1 == 1, skip n_past elements
     // if mode & 2 == 1, GPT-NeoX style
     // TODO: avoid creating a new tensor every time
@@ -697,13 +875,39 @@ extern "C" {
             int                   n_dims,
             int                   mode);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode);
+
+    // rotary position embedding backward, i.e compute dx from dy
+    // a - dy
+    GGML_API struct ggml_tensor * ggml_rope_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode);
+
     // alibi position embedding
     // in-place, returns view(a)
     struct ggml_tensor * ggml_alibi(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   n_past,
-            int                   n_head);
+            int                   n_head,
+            float                 bias_max);
+
+    // clamp
+    // in-place, returns view(a)
+    struct ggml_tensor * ggml_clamp(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 min,
+            float                 max);
 
     // padding = 1
     // TODO: we don't support extra parameters for now
@@ -741,13 +945,13 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            const ggml_unary_op_f32_t fun);
+            ggml_unary_op_f32_t   fun);
 
     GGML_API struct ggml_tensor * ggml_map_binary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
            struct ggml_tensor  * b,
-            const ggml_binary_op_f32_t fun);
+            ggml_binary_op_f32_t  fun);
 
     //
     // automatic differentiation
@@ -876,7 +1080,6 @@ extern "C" {
 
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
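
Note (editor's illustration, not part of the package diff): among the header additions above, ggml_sum_rows() documents its shape contract directly ([a,b,c,d] -> [1,b,c,d]). A minimal sketch of calling it through the pre-GGUF graph API this ggml snapshot uses (ggml_build_forward() returns the graph by value and n_threads lives on the graph); buffer size and values are arbitrary:

#include "ggml.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024, // small scratch arena, size arbitrary
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // 4 columns x 3 rows, all 1.0f -> each row sums to 4.0f
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    ggml_set_f32(a, 1.0f);

    struct ggml_tensor * s = ggml_sum_rows(ctx, a); // shape [1,3]

    struct ggml_cgraph gf = ggml_build_forward(s);
    gf.n_threads = 1;
    ggml_graph_compute(ctx, &gf);

    for (int i = 0; i < 3; ++i) {
        printf("row %d sum = %f\n", i, ggml_get_f32_1d(s, i)); // prints 4.000000
    }

    ggml_free(ctx);
    return 0;
}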
package/cpp/whisper.cpp CHANGED
@@ -139,7 +139,7 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
     { "hi", { 17, "hindi", } },
     { "fi", { 18, "finnish", } },
     { "vi", { 19, "vietnamese", } },
-    { "iw", { 20, "hebrew", } },
+    { "he", { 20, "hebrew", } },
     { "uk", { 21, "ukrainian", } },
     { "el", { 22, "greek", } },
     { "ms", { 23, "malay", } },
@@ -291,15 +291,6 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
             { MODEL_LARGE, 1124ull*MB },
         },
     },
-    { GGML_TYPE_Q4_2,
-        {
-            { MODEL_TINY,    26ull*MB },
-            { MODEL_BASE,    50ull*MB },
-            { MODEL_SMALL,  154ull*MB },
-            { MODEL_MEDIUM, 470ull*MB },
-            { MODEL_LARGE,  940ull*MB },
-        },
-    },
     { GGML_TYPE_Q5_0,
         {
             { MODEL_TINY,    30ull*MB },
@@ -861,6 +852,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
            model.type = e_model::MODEL_LARGE;
        }
 
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+
        // for the big tensors, we have the option to store the data in 16-bit floats or quantized
        // in order to save memory and also to speed up the computation
        wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
@@ -882,6 +877,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
        fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
        fprintf(stderr, "%s: n_mels       = %d\n", __func__, hparams.n_mels);
        fprintf(stderr, "%s: ftype        = %d\n", __func__, model.hparams.ftype);
+       fprintf(stderr, "%s: qntvr        = %d\n", __func__, qntvr);
        fprintf(stderr, "%s: type         = %d\n", __func__, model.type);
 
        // print memory requirements
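
Note (editor's illustration, not part of the package diff): the two loader hunks above decode a quantization version that ggml now packs into the upper decimal digits of hparams.ftype, i.e. stored = qntvr * GGML_QNT_VERSION_FACTOR + ftype. The arithmetic in isolation, with illustrative values:

#include <cstdint>
#include <cstdio>

int main() {
    // writer side: pack qntvr 2 (GGML_QNT_VERSION) with ftype 7 (GGML_FTYPE_MOSTLY_Q8_0)
    int32_t ftype = 2 * 1000 + 7; // 1000 == GGML_QNT_VERSION_FACTOR

    // loader side, as in whisper_model_load() above: unpack
    const int32_t qntvr = ftype / 1000;
    ftype %= 1000;

    printf("qntvr = %d, ftype = %d\n", qntvr, ftype); // qntvr = 2, ftype = 7
    return 0;
}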
@@ -1106,7 +1102,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
        ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
    }
 
-   ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
+   ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead
 
    fprintf(stderr, "%s: model ctx = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
 }
@@ -1554,14 +1550,14 @@ static bool whisper_encode_internal(
                        Qcur),
                    Qcur);
 
-           //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+           //Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
            // note: no bias for Key
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                    layer.attn_k_w,
                    cur);
 
-           //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+           //Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
                    layer.attn_v_w,
@@ -1621,12 +1617,12 @@ static bool whisper_encode_internal(
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
            struct ggml_tensor * KQ_scaled =
-               ggml_scale(ctx0,
+               ggml_scale_inplace(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
                        );
 
-           struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
+           struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
 
            struct ggml_tensor * V =
                ggml_cpy(ctx0,
@@ -1809,7 +1805,7 @@ static bool whisper_encode_internal(
                    layer.cross_attn_k_w,
                    cur);
 
-           Kcross = ggml_scale(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
+           Kcross = ggml_scale_inplace(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
 
            wstate.use_buf(ctx0, 1);
 
@@ -1956,14 +1952,14 @@ static bool whisper_decode_internal(
                        Qcur),
                    Qcur);
 
-           Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+           Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
            // note: no bias for Key
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                    layer.attn_k_w,
                    cur);
 
-           Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+           Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
            // store key and value to memory
            {
@@ -2012,14 +2008,14 @@ static bool whisper_decode_internal(
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
            //struct ggml_tensor * KQ_scaled =
-           //    ggml_scale(ctx0,
+           //    ggml_scale_inplace(ctx0,
            //            KQ,
            //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
            //            );
 
-           struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
+           struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
 
-           struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+           struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
 
            struct ggml_tensor * V =
                ggml_view_3d(ctx0, kv_self.v,
@@ -2083,7 +2079,7 @@ static bool whisper_decode_internal(
                        Qcur),
                    Qcur);
 
-           Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+           Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
            // Kcross is already scaled
            struct ggml_tensor * Kcross =
@@ -2123,15 +2119,15 @@ static bool whisper_decode_internal(
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
            //struct ggml_tensor * KQ_scaled =
-           //    ggml_scale(ctx0,
+           //    ggml_scale_inplace(ctx0,
            //            KQ,
            //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
            //            );
 
            // no masking for cross-attention
-           //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+           //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
 
-           struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
+           struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);
 
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
 
@@ -2602,6 +2598,15 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
        path_bin = path_bin.substr(0, pos);
    }
 
+   // match "-qx_x"
+   pos = path_bin.rfind('-');
+   if (pos != std::string::npos) {
+       auto sub = path_bin.substr(pos);
+       if (sub.size() == 5 && sub[1] == 'q' && sub[3] == '_') {
+           path_bin = path_bin.substr(0, pos);
+       }
+   }
+
    path_bin += "-encoder.mlmodelc";
 
    return path_bin;
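
Note (editor's illustration, not part of the package diff): the hunk above strips a trailing "-qX_Y" quantization suffix before appending "-encoder.mlmodelc", so a quantized model should resolve to the same Core ML encoder as its full-precision counterpart. Hypothetical file names:

// "ggml-base-q5_0.bin" -> "ggml-base-encoder.mlmodelc"
// "ggml-base.bin"      -> "ggml-base-encoder.mlmodelc"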
@@ -2847,6 +2852,12 @@ void whisper_free(struct whisper_context * ctx) {
    }
 }
 
+void whisper_free_params(struct whisper_full_params * params) {
+    if (params) {
+        delete params;
+    }
+}
+
 int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
        fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
@@ -3280,6 +3291,14 @@ const char * whisper_print_system_info(void) {
 
 ////////////////////////////////////////////////////////////////////////////
 
+struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy) {
+    struct whisper_full_params params = whisper_full_default_params(strategy);
+
+    struct whisper_full_params* result = new whisper_full_params();
+    *result = params;
+    return result;
+}
+
 struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
    struct whisper_full_params result = {
        /*.strategy =*/ strategy,
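
Note (editor's illustration, not part of the package diff): whisper_full_default_params_by_ref() heap-allocates the params struct so that bindings (such as this package's native bridge) can hold and mutate it across an FFI boundary; it must be released with the matching whisper_free_params() added above, not free(). A minimal usage sketch, assuming ctx, pcm, and n_samples are set up elsewhere:

struct whisper_full_params * p = whisper_full_default_params_by_ref(WHISPER_SAMPLING_GREEDY);
p->n_threads = 4; // fields can be tweaked through the pointer
if (whisper_full(ctx, *p, pcm, n_samples) != 0) {
    fprintf(stderr, "whisper_full failed\n");
}
whisper_free_params(p); // allocated with new; free via the matching API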
@@ -4903,7 +4922,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
    // b: N*N*sizeof(float)
    // c: N*N*sizeof(float)
    // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
-   std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
+   std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);
 
    // put a bunch of random data in the buffer
    for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
@@ -4911,7 +4930,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
    for (int j = 0; j < (int) sizes.size(); j++) {
        int n_q4_0 = 0;
        int n_q4_1 = 0;
-       int n_q4_2 = 0;
        int n_q5_0 = 0;
        int n_q5_1 = 0;
        int n_q8_0 = 0;
@@ -4921,7 +4939,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
        // GFLOPS/s
        double s_q4_0 = 0.0;
        double s_q4_1 = 0.0;
-       double s_q4_2 = 0.0;
        double s_q5_0 = 0.0;
        double s_q5_1 = 0.0;
        double s_q8_0 = 0.0;
@@ -4930,18 +4947,17 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
 
        const size_t N = sizes[j];
 
-       for (int k = 0; k < 8; ++k) {
+       for (int k = 0; k < 7; ++k) {
            const ggml_type wtype =
                k == 0 ? GGML_TYPE_Q4_0 :
                k == 1 ? GGML_TYPE_Q4_1 :
-               k == 2 ? GGML_TYPE_Q4_2 :
-               k == 3 ? GGML_TYPE_Q5_0 :
-               k == 4 ? GGML_TYPE_Q5_1 :
-               k == 5 ? GGML_TYPE_Q8_0 :
-               k == 6 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+               k == 2 ? GGML_TYPE_Q5_0 :
+               k == 3 ? GGML_TYPE_Q5_1 :
+               k == 4 ? GGML_TYPE_Q8_0 :
+               k == 5 ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-           double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q4_2 : k == 3 ? s_q5_0 : k == 4 ? s_q5_1 : k == 5 ? s_q8_0 : k == 6 ? s_fp16 : /*k == 7*/ s_fp32;
-           int    & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q4_2 : k == 3 ? n_q5_0 : k == 4 ? n_q5_1 : k == 5 ? n_q8_0 : k == 6 ? n_fp16 : /*k == 7*/ n_fp32;
+           double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32;
+           int    & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? n_fp16 : /*k == 6*/ n_fp32;
 
            struct ggml_init_params gparams = {
                /*.mem_size =*/ buf.size(),
@@ -4985,9 +5001,9 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
            s = ((2.0*N*N*N*n)/tsum)*1e-9;
        }
 
-       // Q4_0 | Q4_1 | Q4_2
-       snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs) | Q4_2 %7.1f GFLOPS (%3d runs)\n",
-               N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_q4_2, n_q4_2);
+       // Q4_0 | Q4_1
+       snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n",
+               N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1);
        s += strbuf;
 
        // Q5_0 | Q5_1 | Q8_0