whisper.rn 0.3.0-rc.0 → 0.3.0-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/ggml.h CHANGED
@@ -190,9 +190,12 @@
 #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1
 
+#define GGML_QNT_VERSION        1    // bump this on quantization format changes
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
 #define GGML_MAX_DIMS          4
 #define GGML_MAX_NODES         4096
-#define GGML_MAX_PARAMS        16
+#define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_OPT           4
 #define GGML_DEFAULT_N_THREADS 4
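
Note: the new GGML_QNT_VERSION / GGML_QNT_VERSION_FACTOR pair lets a model file carry the quantization format version inside the existing ftype header field rather than in a new field. The decode half of the arithmetic appears verbatim in the whisper_model_load hunk further down; the encode half below is an assumption implied by the factor scheme. A minimal standalone sketch:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int32_t qnt_version = 1;    // GGML_QNT_VERSION
        const int32_t factor      = 1000; // GGML_QNT_VERSION_FACTOR

        const int32_t ftype = 2; // e.g. GGML_FTYPE_MOSTLY_Q4_0

        // writer side (assumed): fold the quantization version into ftype
        const int32_t packed = qnt_version*factor + ftype; // -> 1002

        // reader side, exactly as in whisper_model_load below
        const int32_t qntvr     = packed / factor;         // -> 1
        const int32_t ftype_out = packed % factor;         // -> 2

        printf("qntvr = %d, ftype = %d\n", qntvr, ftype_out);
        return 0;
    }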
@@ -231,7 +234,7 @@ extern "C" {
         GGML_TYPE_F16  = 1,
         GGML_TYPE_Q4_0 = 2,
         GGML_TYPE_Q4_1 = 3,
-        GGML_TYPE_Q4_2 = 4,
+        // GGML_TYPE_Q4_2 = 4, support has been removed
         // GGML_TYPE_Q4_3 (5) support has been removed
         GGML_TYPE_Q5_0 = 6,
         GGML_TYPE_Q5_1 = 7,
@@ -243,6 +246,11 @@ extern "C" {
         GGML_TYPE_COUNT,
     };
 
+    enum ggml_backend {
+        GGML_BACKEND_CPU = 0,
+        GGML_BACKEND_CUDA = 1,
+    };
+
     // model file types
     enum ggml_ftype {
         GGML_FTYPE_UNKNOWN = -1,
@@ -251,7 +259,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
@@ -263,12 +270,16 @@ extern "C" {
 
         GGML_OP_DUP,
         GGML_OP_ADD,
+        GGML_OP_ADD1,
+        GGML_OP_ACC,
         GGML_OP_SUB,
         GGML_OP_MUL,
         GGML_OP_DIV,
         GGML_OP_SQR,
         GGML_OP_SQRT,
+        GGML_OP_LOG,
         GGML_OP_SUM,
+        GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
         GGML_OP_REPEAT,
         GGML_OP_ABS,
@@ -278,12 +289,15 @@ extern "C" {
         GGML_OP_RELU,
         GGML_OP_GELU,
         GGML_OP_SILU,
+        GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
         GGML_OP_RMS_NORM,
+        GGML_OP_RMS_NORM_BACK,
 
         GGML_OP_MUL_MAT,
 
         GGML_OP_SCALE,
+        GGML_OP_SET,
         GGML_OP_CPY,
         GGML_OP_CONT,
         GGML_OP_RESHAPE,
@@ -291,9 +305,13 @@ extern "C" {
         GGML_OP_PERMUTE,
         GGML_OP_TRANSPOSE,
         GGML_OP_GET_ROWS,
+        GGML_OP_GET_ROWS_BACK,
+        GGML_OP_DIAG,
         GGML_OP_DIAG_MASK_INF,
+        GGML_OP_DIAG_MASK_ZERO,
         GGML_OP_SOFT_MAX,
         GGML_OP_ROPE,
+        GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
         GGML_OP_CONV_1D_1S,
         GGML_OP_CONV_1D_2S,
@@ -322,7 +340,8 @@ extern "C" {
 
     // n-dimensional tensor
     struct ggml_tensor {
-        enum ggml_type type;
+        enum ggml_type    type;
+        enum ggml_backend backend;
 
         int n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -353,7 +372,7 @@ extern "C" {
 
         char name[32];
 
-        char padding[8]; // TODO: remove and add padding to name?
+        char padding[16];
     };
 
     // computation graph
@@ -497,6 +516,29 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_add1(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_acc(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_acc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
     GGML_API struct ggml_tensor * ggml_sub(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
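
Note: ggml_acc and ggml_acc_inplace take the same nb1/nb2/nb3/offset arguments as the new ggml_set family further down, i.e. they address a byte-strided view of a; judging by the name, b is accumulated into that view rather than written over it (the header itself does not say). A hypothetical helper, not part of ggml, showing how such byte strides select a destination row:

    #include <cstddef>
    #include <cstdint>

    // Accumulate a contiguous float row b of length ne0 into row i1 of the
    // raw buffer of `a`, addressed the way ggml's nb/offset arguments work
    // (nb1 = byte stride between rows, offset = byte offset of the view).
    void acc_row(char * a_data, const float * b, int64_t ne0,
                 size_t nb1, int64_t i1, size_t offset) {
        float * dst = (float *)(a_data + offset + i1*nb1);
        for (int64_t i0 = 0; i0 < ne0; ++i0) {
            dst[i0] += b[i0]; // '+=' here; ggml_set presumably overwrites
        }
    }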
@@ -520,12 +562,24 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_log(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_log_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // return scalar
-    // TODO: compute sum along rows
     GGML_API struct ggml_tensor * ggml_sum(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
+    GGML_API struct ggml_tensor * ggml_sum_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // mean along rows
     GGML_API struct ggml_tensor * ggml_mean(
             struct ggml_context * ctx,
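
Note: a minimal sketch of the ggml_sum_rows shape contract ([a,b,c,d] -> [1,b,c,d]), written against the graph API as it existed in this release (the ggml_build_forward / ggml_graph_compute entry points changed in later ggml versions):

    #include "ggml.h"

    int main() {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // ne = [4, 3]: rows of 4 elements, 3 rows, all set to 1.0f
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
        ggml_set_f32(a, 1.0f);

        struct ggml_tensor * s = ggml_sum_rows(ctx, a); // ne = [1, 3]

        struct ggml_cgraph gf = ggml_build_forward(s);
        ggml_graph_compute(ctx, &gf);

        // each of the 3 result elements is 4.0f
        ggml_free(ctx);
        return 0;
    }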
@@ -567,6 +621,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_silu_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     // normalize along rows
     // TODO: eps is hardcoded to 1e-5 for now
     GGML_API struct ggml_tensor * ggml_norm(
@@ -577,6 +638,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_rms_norm_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     // A: m rows, n columns
     // B: p rows, n columns (i.e. we transpose it internally)
     // result is m columns, p rows
@@ -589,12 +657,66 @@ extern "C" {
     // operations on tensors without backpropagation
     //
 
-    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_scale(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_scale_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                offset);
+
+
     // a -> b, return view(b)
     GGML_API struct ggml_tensor * ggml_cpy(
             struct ggml_context * ctx,
@@ -615,6 +737,11 @@ extern "C" {
 
     // return view(a)
     // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0);
+
     GGML_API struct ggml_tensor * ggml_reshape_2d(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -630,6 +757,14 @@ extern "C" {
             int64_t ne1,
             int64_t ne2);
 
+    GGML_API struct ggml_tensor * ggml_reshape_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3);
+
     // offset in bytes
     GGML_API struct ggml_tensor * ggml_view_1d(
             struct ggml_context * ctx,
@@ -655,6 +790,18 @@ extern "C" {
             size_t                nb2, // slice stride in bytes
             size_t                offset);
 
+    GGML_API struct ggml_tensor * ggml_view_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3,
+            size_t                nb1, // row stride in bytes
+            size_t                nb2, // slice stride in bytes
+            size_t                nb3,
+            size_t                offset);
+
     GGML_API struct ggml_tensor * ggml_permute(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -673,20 +820,50 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_get_rows_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c);
+
+    GGML_API struct ggml_tensor * ggml_diag(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // set elements above the diagonal to -INF
-    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_diag_mask_inf(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int n_past);
 
     // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    // set elements above the diagonal to 0
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
     GGML_API struct ggml_tensor * ggml_soft_max(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
-    // rotary position embedding
     // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // rotary position embedding
     // if mode & 1 == 1, skip n_past elements
     // if mode & 2 == 1, GPT-NeoX style
     // TODO: avoid creating a new tensor every time
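
Note: ggml_diag_mask_zero is the 0-filling counterpart of ggml_diag_mask_inf; both implement the causal mask that whisper.cpp applies to KQ in the decoder. A standalone illustration on plain arrays (the `col > n_past + row` condition is my reading of the "elements above the diagonal" comments, with n_past shifting the diagonal):

    #include <cmath>
    #include <cstdio>

    int main() {
        const int n = 4, n_past = 0;
        float m[4][4];
        for (int row = 0; row < n; ++row) {
            for (int col = 0; col < n; ++col) {
                // -INFINITY for the _inf variant, 0.0f for the _zero variant
                m[row][col] = (col > n_past + row) ? -INFINITY : 1.0f;
            }
        }
        for (int row = 0; row < n; ++row, printf("\n")) {
            for (int col = 0; col < n; ++col) {
                printf("%7.1f", m[row][col]); // lower triangle stays 1.0
            }
        }
        return 0;
    }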
@@ -697,6 +874,23 @@ extern "C" {
             int n_dims,
             int mode);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode);
+
+    // rotary position embedding backward, i.e. compute dx from dy
+    // a - dy
+    GGML_API struct ggml_tensor * ggml_rope_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode);
+
     // alibi position embedding
     // in-place, returns view(a)
     struct ggml_tensor * ggml_alibi(
@@ -741,13 +935,13 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            const ggml_unary_op_f32_t fun);
+            ggml_unary_op_f32_t fun);
 
     GGML_API struct ggml_tensor * ggml_map_binary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
-            const ggml_binary_op_f32_t fun);
+            ggml_binary_op_f32_t fun);
 
     //
     // automatic differentiation
@@ -876,7 +1070,6 @@ extern "C" {
 
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
package/cpp/whisper.cpp CHANGED
@@ -291,15 +291,6 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
           { MODEL_LARGE,  1124ull*MB },
       },
     },
-    { GGML_TYPE_Q4_2,
-      {
-          { MODEL_TINY,     26ull*MB },
-          { MODEL_BASE,     50ull*MB },
-          { MODEL_SMALL,   154ull*MB },
-          { MODEL_MEDIUM,  470ull*MB },
-          { MODEL_LARGE,   940ull*MB },
-      },
-    },
     { GGML_TYPE_Q5_0,
       {
           { MODEL_TINY,     30ull*MB },
@@ -861,6 +852,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             model.type = e_model::MODEL_LARGE;
         }
 
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+
         // for the big tensors, we have the option to store the data in 16-bit floats or quantized
         // in order to save memory and also to speed up the computation
         wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
@@ -882,6 +877,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         fprintf(stderr, "%s: n_text_layer  = %d\n", __func__, hparams.n_text_layer);
         fprintf(stderr, "%s: n_mels        = %d\n", __func__, hparams.n_mels);
         fprintf(stderr, "%s: ftype         = %d\n", __func__, model.hparams.ftype);
+        fprintf(stderr, "%s: qntvr         = %d\n", __func__, qntvr);
         fprintf(stderr, "%s: type          = %d\n", __func__, model.type);
 
         // print memory requirements
@@ -1106,7 +1102,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
         }
 
-        ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
+        ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead
 
         fprintf(stderr, "%s: model ctx     = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
@@ -1554,14 +1550,14 @@ static bool whisper_encode_internal(
                         Qcur),
                     Qcur);
 
-            //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            //Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
             // note: no bias for Key
             struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                     layer.attn_k_w,
                     cur);
 
-            //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            //Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
                     layer.attn_v_w,
@@ -1621,12 +1617,12 @@ static bool whisper_encode_internal(
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
             struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
+                ggml_scale_inplace(ctx0,
                         KQ,
                         ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
                         );
 
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
 
             struct ggml_tensor * V =
                 ggml_cpy(ctx0,
@@ -1809,7 +1805,7 @@ static bool whisper_encode_internal(
                     layer.cross_attn_k_w,
                     cur);
 
-            Kcross = ggml_scale(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
+            Kcross = ggml_scale_inplace(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
 
             wstate.use_buf(ctx0, 1);
 
@@ -1956,14 +1952,14 @@ static bool whisper_decode_internal(
                         Qcur),
                     Qcur);
 
-            Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
             // note: no bias for Key
             struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                     layer.attn_k_w,
                     cur);
 
-            Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
             // store key and value to memory
             {
@@ -2012,14 +2008,14 @@ static bool whisper_decode_internal(
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
             //struct ggml_tensor * KQ_scaled =
-            //    ggml_scale(ctx0,
+            //    ggml_scale_inplace(ctx0,
             //            KQ,
             //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
             //            );
 
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
 
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
 
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
@@ -2083,7 +2079,7 @@ static bool whisper_decode_internal(
                         Qcur),
                     Qcur);
 
-            Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
             // Kcross is already scaled
             struct ggml_tensor * Kcross =
@@ -2123,15 +2119,15 @@ static bool whisper_decode_internal(
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
             //struct ggml_tensor * KQ_scaled =
-            //    ggml_scale(ctx0,
+            //    ggml_scale_inplace(ctx0,
             //            KQ,
             //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
             //            );
 
             // no masking for cross-attention
-            //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+            //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
 
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);
 
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
 
@@ -2602,6 +2598,15 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
         path_bin = path_bin.substr(0, pos);
     }
 
+    // match "-qx_x"
+    pos = path_bin.rfind('-');
+    if (pos != std::string::npos) {
+        auto sub = path_bin.substr(pos);
+        if (sub.size() == 5 && sub[1] == 'q' && sub[3] == '_') {
+            path_bin = path_bin.substr(0, pos);
+        }
+    }
+
     path_bin += "-encoder.mlmodelc";
 
     return path_bin;
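
Note: the added "-qx_x" match strips a quantization suffix such as "-q5_1" from the model path, so quantized models resolve to the same Core ML encoder bundle as their f16 counterparts. A standalone copy of the logic (the extension strip at the top mirrors context lines only partially shown in the hunk):

    #include <cassert>
    #include <string>

    static std::string coreml_encoder_path(std::string path_bin) {
        auto pos = path_bin.rfind('.');
        if (pos != std::string::npos) {
            path_bin = path_bin.substr(0, pos); // drop ".bin"
        }
        // match "-qx_x", e.g. "-q5_1"
        pos = path_bin.rfind('-');
        if (pos != std::string::npos) {
            auto sub = path_bin.substr(pos);
            if (sub.size() == 5 && sub[1] == 'q' && sub[3] == '_') {
                path_bin = path_bin.substr(0, pos);
            }
        }
        return path_bin + "-encoder.mlmodelc";
    }

    int main() {
        // quantized and f16 models now resolve to the same encoder bundle
        assert(coreml_encoder_path("ggml-base-q5_1.bin") == "ggml-base-encoder.mlmodelc");
        assert(coreml_encoder_path("ggml-base.bin")      == "ggml-base-encoder.mlmodelc");
        return 0;
    }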
@@ -4903,7 +4908,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
     // b: N*N*sizeof(float)
     // c: N*N*sizeof(float)
     // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
-    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
+    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);
 
     // put a bunch of random data in the buffer
     for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
@@ -4911,7 +4916,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
     for (int j = 0; j < (int) sizes.size(); j++) {
         int n_q4_0 = 0;
         int n_q4_1 = 0;
-        int n_q4_2 = 0;
         int n_q5_0 = 0;
         int n_q5_1 = 0;
         int n_q8_0 = 0;
@@ -4921,7 +4925,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
         // GFLOPS/s
         double s_q4_0 = 0.0;
         double s_q4_1 = 0.0;
-        double s_q4_2 = 0.0;
         double s_q5_0 = 0.0;
         double s_q5_1 = 0.0;
         double s_q8_0 = 0.0;
@@ -4930,18 +4933,17 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
 
         const size_t N = sizes[j];
 
-        for (int k = 0; k < 8; ++k) {
+        for (int k = 0; k < 7; ++k) {
             const ggml_type wtype =
                 k == 0 ? GGML_TYPE_Q4_0 :
                 k == 1 ? GGML_TYPE_Q4_1 :
-                k == 2 ? GGML_TYPE_Q4_2 :
-                k == 3 ? GGML_TYPE_Q5_0 :
-                k == 4 ? GGML_TYPE_Q5_1 :
-                k == 5 ? GGML_TYPE_Q8_0 :
-                k == 6 ? GGML_TYPE_F16  : GGML_TYPE_F32;
+                k == 2 ? GGML_TYPE_Q5_0 :
+                k == 3 ? GGML_TYPE_Q5_1 :
+                k == 4 ? GGML_TYPE_Q8_0 :
+                k == 5 ? GGML_TYPE_F16  : GGML_TYPE_F32;
 
-            double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q4_2 : k == 3 ? s_q5_0 : k == 4 ? s_q5_1 : k == 5 ? s_q8_0 : k == 6 ? s_fp16 : /*k == 7*/ s_fp32;
-            int    & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q4_2 : k == 3 ? n_q5_0 : k == 4 ? n_q5_1 : k == 5 ? n_q8_0 : k == 6 ? n_fp16 : /*k == 7*/ n_fp32;
+            double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32;
+            int    & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? n_fp16 : /*k == 6*/ n_fp32;
 
             struct ggml_init_params gparams = {
                 /*.mem_size   =*/ buf.size(),
@@ -4985,9 +4987,9 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
             s = ((2.0*N*N*N*n)/tsum)*1e-9;
         }
 
-        // Q4_0 | Q4_1 | Q4_2
-        snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs) | Q4_2 %7.1f GFLOPS (%3d runs)\n",
-                N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_q4_2, n_q4_2);
+        // Q4_0 | Q4_1
+        snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n",
+                N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1);
         s += strbuf;
 
         // Q5_0 | Q5_1 | Q8_0
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "whisper.rn",
-  "version": "0.3.0-rc.0",
+  "version": "0.3.0-rc.2",
   "description": "React Native binding of whisper.cpp",
   "main": "lib/commonjs/index",
   "module": "lib/module/index",
@@ -14,6 +14,7 @@
     "android",
     "ios",
     "cpp/*.*",
+    "cpp/coreml/*.*",
    "*.podspec",
     "!lib/typescript/example",
     "!ios/build",