whisper.rn 0.3.0-rc.1 → 0.3.0-rc.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/cpp/ggml.c +6486 -4317
- package/cpp/ggml.h +205 -12
- package/cpp/whisper.cpp +40 -38
- package/package.json +1 -1
package/cpp/ggml.h
CHANGED
|
@@ -190,9 +190,12 @@
|
|
|
190
190
|
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
|
191
191
|
#define GGML_FILE_VERSION 1
|
|
192
192
|
|
|
193
|
+
#define GGML_QNT_VERSION 1 // bump this on quantization format changes
|
|
194
|
+
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
|
195
|
+
|
|
193
196
|
#define GGML_MAX_DIMS 4
|
|
194
197
|
#define GGML_MAX_NODES 4096
|
|
195
|
-
#define GGML_MAX_PARAMS
|
|
198
|
+
#define GGML_MAX_PARAMS 256
|
|
196
199
|
#define GGML_MAX_CONTEXTS 64
|
|
197
200
|
#define GGML_MAX_OPT 4
|
|
198
201
|
#define GGML_DEFAULT_N_THREADS 4
|
|
@@ -231,7 +234,7 @@ extern "C" {
|
|
|
231
234
|
GGML_TYPE_F16 = 1,
|
|
232
235
|
GGML_TYPE_Q4_0 = 2,
|
|
233
236
|
GGML_TYPE_Q4_1 = 3,
|
|
234
|
-
GGML_TYPE_Q4_2 = 4,
|
|
237
|
+
// GGML_TYPE_Q4_2 = 4, support has been removed
|
|
235
238
|
// GGML_TYPE_Q4_3 (5) support has been removed
|
|
236
239
|
GGML_TYPE_Q5_0 = 6,
|
|
237
240
|
GGML_TYPE_Q5_1 = 7,
|
|
@@ -243,6 +246,11 @@ extern "C" {
|
|
|
243
246
|
GGML_TYPE_COUNT,
|
|
244
247
|
};
|
|
245
248
|
|
|
249
|
+
enum ggml_backend {
|
|
250
|
+
GGML_BACKEND_CPU = 0,
|
|
251
|
+
GGML_BACKEND_CUDA = 1,
|
|
252
|
+
};
|
|
253
|
+
|
|
246
254
|
// model file types
|
|
247
255
|
enum ggml_ftype {
|
|
248
256
|
GGML_FTYPE_UNKNOWN = -1,
|
|
@@ -251,7 +259,6 @@ extern "C" {
|
|
|
251
259
|
GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
|
252
260
|
GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
|
253
261
|
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
|
254
|
-
GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
|
|
255
262
|
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
|
256
263
|
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
|
257
264
|
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
|
@@ -263,12 +270,16 @@ extern "C" {
|
|
|
263
270
|
|
|
264
271
|
GGML_OP_DUP,
|
|
265
272
|
GGML_OP_ADD,
|
|
273
|
+
GGML_OP_ADD1,
|
|
274
|
+
GGML_OP_ACC,
|
|
266
275
|
GGML_OP_SUB,
|
|
267
276
|
GGML_OP_MUL,
|
|
268
277
|
GGML_OP_DIV,
|
|
269
278
|
GGML_OP_SQR,
|
|
270
279
|
GGML_OP_SQRT,
|
|
280
|
+
GGML_OP_LOG,
|
|
271
281
|
GGML_OP_SUM,
|
|
282
|
+
GGML_OP_SUM_ROWS,
|
|
272
283
|
GGML_OP_MEAN,
|
|
273
284
|
GGML_OP_REPEAT,
|
|
274
285
|
GGML_OP_ABS,
|
|
@@ -278,12 +289,15 @@ extern "C" {
|
|
|
278
289
|
GGML_OP_RELU,
|
|
279
290
|
GGML_OP_GELU,
|
|
280
291
|
GGML_OP_SILU,
|
|
292
|
+
GGML_OP_SILU_BACK,
|
|
281
293
|
GGML_OP_NORM, // normalize
|
|
282
294
|
GGML_OP_RMS_NORM,
|
|
295
|
+
GGML_OP_RMS_NORM_BACK,
|
|
283
296
|
|
|
284
297
|
GGML_OP_MUL_MAT,
|
|
285
298
|
|
|
286
299
|
GGML_OP_SCALE,
|
|
300
|
+
GGML_OP_SET,
|
|
287
301
|
GGML_OP_CPY,
|
|
288
302
|
GGML_OP_CONT,
|
|
289
303
|
GGML_OP_RESHAPE,
|
|
@@ -291,9 +305,13 @@ extern "C" {
|
|
|
291
305
|
GGML_OP_PERMUTE,
|
|
292
306
|
GGML_OP_TRANSPOSE,
|
|
293
307
|
GGML_OP_GET_ROWS,
|
|
308
|
+
GGML_OP_GET_ROWS_BACK,
|
|
309
|
+
GGML_OP_DIAG,
|
|
294
310
|
GGML_OP_DIAG_MASK_INF,
|
|
311
|
+
GGML_OP_DIAG_MASK_ZERO,
|
|
295
312
|
GGML_OP_SOFT_MAX,
|
|
296
313
|
GGML_OP_ROPE,
|
|
314
|
+
GGML_OP_ROPE_BACK,
|
|
297
315
|
GGML_OP_ALIBI,
|
|
298
316
|
GGML_OP_CONV_1D_1S,
|
|
299
317
|
GGML_OP_CONV_1D_2S,
|
|
@@ -322,7 +340,8 @@ extern "C" {
|
|
|
322
340
|
|
|
323
341
|
// n-dimensional tensor
|
|
324
342
|
struct ggml_tensor {
|
|
325
|
-
enum ggml_type
|
|
343
|
+
enum ggml_type type;
|
|
344
|
+
enum ggml_backend backend;
|
|
326
345
|
|
|
327
346
|
int n_dims;
|
|
328
347
|
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
|
@@ -353,7 +372,7 @@ extern "C" {
|
|
|
353
372
|
|
|
354
373
|
char name[32];
|
|
355
374
|
|
|
356
|
-
char padding[
|
|
375
|
+
char padding[16];
|
|
357
376
|
};
|
|
358
377
|
|
|
359
378
|
// computation graph
|
|
@@ -497,6 +516,29 @@ extern "C" {
|
|
|
497
516
|
struct ggml_tensor * a,
|
|
498
517
|
struct ggml_tensor * b);
|
|
499
518
|
|
|
519
|
+
GGML_API struct ggml_tensor * ggml_add1(
|
|
520
|
+
struct ggml_context * ctx,
|
|
521
|
+
struct ggml_tensor * a,
|
|
522
|
+
struct ggml_tensor * b);
|
|
523
|
+
|
|
524
|
+
GGML_API struct ggml_tensor * ggml_acc(
|
|
525
|
+
struct ggml_context * ctx,
|
|
526
|
+
struct ggml_tensor * a,
|
|
527
|
+
struct ggml_tensor * b,
|
|
528
|
+
size_t nb1,
|
|
529
|
+
size_t nb2,
|
|
530
|
+
size_t nb3,
|
|
531
|
+
size_t offset);
|
|
532
|
+
|
|
533
|
+
GGML_API struct ggml_tensor * ggml_acc_inplace(
|
|
534
|
+
struct ggml_context * ctx,
|
|
535
|
+
struct ggml_tensor * a,
|
|
536
|
+
struct ggml_tensor * b,
|
|
537
|
+
size_t nb1,
|
|
538
|
+
size_t nb2,
|
|
539
|
+
size_t nb3,
|
|
540
|
+
size_t offset);
|
|
541
|
+
|
|
500
542
|
GGML_API struct ggml_tensor * ggml_sub(
|
|
501
543
|
struct ggml_context * ctx,
|
|
502
544
|
struct ggml_tensor * a,
|
|
@@ -520,12 +562,24 @@ extern "C" {
|
|
|
520
562
|
struct ggml_context * ctx,
|
|
521
563
|
struct ggml_tensor * a);
|
|
522
564
|
|
|
565
|
+
GGML_API struct ggml_tensor * ggml_log(
|
|
566
|
+
struct ggml_context * ctx,
|
|
567
|
+
struct ggml_tensor * a);
|
|
568
|
+
|
|
569
|
+
GGML_API struct ggml_tensor * ggml_log_inplace(
|
|
570
|
+
struct ggml_context * ctx,
|
|
571
|
+
struct ggml_tensor * a);
|
|
572
|
+
|
|
523
573
|
// return scalar
|
|
524
|
-
// TODO: compute sum along rows
|
|
525
574
|
GGML_API struct ggml_tensor * ggml_sum(
|
|
526
575
|
struct ggml_context * ctx,
|
|
527
576
|
struct ggml_tensor * a);
|
|
528
577
|
|
|
578
|
+
// sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
|
|
579
|
+
GGML_API struct ggml_tensor * ggml_sum_rows(
|
|
580
|
+
struct ggml_context * ctx,
|
|
581
|
+
struct ggml_tensor * a);
|
|
582
|
+
|
|
529
583
|
// mean along rows
|
|
530
584
|
GGML_API struct ggml_tensor * ggml_mean(
|
|
531
585
|
struct ggml_context * ctx,
|
|
@@ -567,6 +621,13 @@ extern "C" {
|
|
|
567
621
|
struct ggml_context * ctx,
|
|
568
622
|
struct ggml_tensor * a);
|
|
569
623
|
|
|
624
|
+
// a - x
|
|
625
|
+
// b - dy
|
|
626
|
+
GGML_API struct ggml_tensor * ggml_silu_back(
|
|
627
|
+
struct ggml_context * ctx,
|
|
628
|
+
struct ggml_tensor * a,
|
|
629
|
+
struct ggml_tensor * b);
|
|
630
|
+
|
|
570
631
|
// normalize along rows
|
|
571
632
|
// TODO: eps is hardcoded to 1e-5 for now
|
|
572
633
|
GGML_API struct ggml_tensor * ggml_norm(
|
|
@@ -577,6 +638,13 @@ extern "C" {
|
|
|
577
638
|
struct ggml_context * ctx,
|
|
578
639
|
struct ggml_tensor * a);
|
|
579
640
|
|
|
641
|
+
// a - x
|
|
642
|
+
// b - dy
|
|
643
|
+
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
|
644
|
+
struct ggml_context * ctx,
|
|
645
|
+
struct ggml_tensor * a,
|
|
646
|
+
struct ggml_tensor * b);
|
|
647
|
+
|
|
580
648
|
// A: m rows, n columns
|
|
581
649
|
// B: p rows, n columns (i.e. we transpose it internally)
|
|
582
650
|
// result is m columns, p rows
|
|
@@ -589,12 +657,66 @@ extern "C" {
|
|
|
589
657
|
// operations on tensors without backpropagation
|
|
590
658
|
//
|
|
591
659
|
|
|
592
|
-
// in-place, returns view(a)
|
|
593
660
|
GGML_API struct ggml_tensor * ggml_scale(
|
|
594
661
|
struct ggml_context * ctx,
|
|
595
662
|
struct ggml_tensor * a,
|
|
596
663
|
struct ggml_tensor * b);
|
|
597
664
|
|
|
665
|
+
// in-place, returns view(a)
|
|
666
|
+
GGML_API struct ggml_tensor * ggml_scale_inplace(
|
|
667
|
+
struct ggml_context * ctx,
|
|
668
|
+
struct ggml_tensor * a,
|
|
669
|
+
struct ggml_tensor * b);
|
|
670
|
+
|
|
671
|
+
// b -> view(a,offset,nb1,nb2,3), return modified a
|
|
672
|
+
GGML_API struct ggml_tensor * ggml_set(
|
|
673
|
+
struct ggml_context * ctx,
|
|
674
|
+
struct ggml_tensor * a,
|
|
675
|
+
struct ggml_tensor * b,
|
|
676
|
+
size_t nb1,
|
|
677
|
+
size_t nb2,
|
|
678
|
+
size_t nb3,
|
|
679
|
+
size_t offset);
|
|
680
|
+
|
|
681
|
+
// b -> view(a,offset,nb1,nb2,3), return view(a)
|
|
682
|
+
GGML_API struct ggml_tensor * ggml_set_inplace(
|
|
683
|
+
struct ggml_context * ctx,
|
|
684
|
+
struct ggml_tensor * a,
|
|
685
|
+
struct ggml_tensor * b,
|
|
686
|
+
size_t nb1,
|
|
687
|
+
size_t nb2,
|
|
688
|
+
size_t nb3,
|
|
689
|
+
size_t offset);
|
|
690
|
+
|
|
691
|
+
GGML_API struct ggml_tensor * ggml_set_1d(
|
|
692
|
+
struct ggml_context * ctx,
|
|
693
|
+
struct ggml_tensor * a,
|
|
694
|
+
struct ggml_tensor * b,
|
|
695
|
+
size_t offset);
|
|
696
|
+
|
|
697
|
+
GGML_API struct ggml_tensor * ggml_set_1d_inplace(
|
|
698
|
+
struct ggml_context * ctx,
|
|
699
|
+
struct ggml_tensor * a,
|
|
700
|
+
struct ggml_tensor * b,
|
|
701
|
+
size_t offset);
|
|
702
|
+
|
|
703
|
+
// b -> view(a,offset,nb1,nb2,3), return modified a
|
|
704
|
+
GGML_API struct ggml_tensor * ggml_set_2d(
|
|
705
|
+
struct ggml_context * ctx,
|
|
706
|
+
struct ggml_tensor * a,
|
|
707
|
+
struct ggml_tensor * b,
|
|
708
|
+
size_t nb1,
|
|
709
|
+
size_t offset);
|
|
710
|
+
|
|
711
|
+
// b -> view(a,offset,nb1,nb2,3), return view(a)
|
|
712
|
+
GGML_API struct ggml_tensor * ggml_set_2d_inplace(
|
|
713
|
+
struct ggml_context * ctx,
|
|
714
|
+
struct ggml_tensor * a,
|
|
715
|
+
struct ggml_tensor * b,
|
|
716
|
+
size_t nb1,
|
|
717
|
+
size_t offset);
|
|
718
|
+
|
|
719
|
+
|
|
598
720
|
// a -> b, return view(b)
|
|
599
721
|
GGML_API struct ggml_tensor * ggml_cpy(
|
|
600
722
|
struct ggml_context * ctx,
|
|
@@ -615,6 +737,11 @@ extern "C" {
|
|
|
615
737
|
|
|
616
738
|
// return view(a)
|
|
617
739
|
// TODO: when we start computing gradient, make a copy instead of view
|
|
740
|
+
GGML_API struct ggml_tensor * ggml_reshape_1d(
|
|
741
|
+
struct ggml_context * ctx,
|
|
742
|
+
struct ggml_tensor * a,
|
|
743
|
+
int64_t ne0);
|
|
744
|
+
|
|
618
745
|
GGML_API struct ggml_tensor * ggml_reshape_2d(
|
|
619
746
|
struct ggml_context * ctx,
|
|
620
747
|
struct ggml_tensor * a,
|
|
@@ -630,6 +757,14 @@ extern "C" {
|
|
|
630
757
|
int64_t ne1,
|
|
631
758
|
int64_t ne2);
|
|
632
759
|
|
|
760
|
+
GGML_API struct ggml_tensor * ggml_reshape_4d(
|
|
761
|
+
struct ggml_context * ctx,
|
|
762
|
+
struct ggml_tensor * a,
|
|
763
|
+
int64_t ne0,
|
|
764
|
+
int64_t ne1,
|
|
765
|
+
int64_t ne2,
|
|
766
|
+
int64_t ne3);
|
|
767
|
+
|
|
633
768
|
// offset in bytes
|
|
634
769
|
GGML_API struct ggml_tensor * ggml_view_1d(
|
|
635
770
|
struct ggml_context * ctx,
|
|
@@ -655,6 +790,18 @@ extern "C" {
|
|
|
655
790
|
size_t nb2, // slice stride in bytes
|
|
656
791
|
size_t offset);
|
|
657
792
|
|
|
793
|
+
GGML_API struct ggml_tensor * ggml_view_4d(
|
|
794
|
+
struct ggml_context * ctx,
|
|
795
|
+
struct ggml_tensor * a,
|
|
796
|
+
int64_t ne0,
|
|
797
|
+
int64_t ne1,
|
|
798
|
+
int64_t ne2,
|
|
799
|
+
int64_t ne3,
|
|
800
|
+
size_t nb1, // row stride in bytes
|
|
801
|
+
size_t nb2, // slice stride in bytes
|
|
802
|
+
size_t nb3,
|
|
803
|
+
size_t offset);
|
|
804
|
+
|
|
658
805
|
GGML_API struct ggml_tensor * ggml_permute(
|
|
659
806
|
struct ggml_context * ctx,
|
|
660
807
|
struct ggml_tensor * a,
|
|
@@ -673,20 +820,50 @@ extern "C" {
|
|
|
673
820
|
struct ggml_tensor * a,
|
|
674
821
|
struct ggml_tensor * b);
|
|
675
822
|
|
|
823
|
+
GGML_API struct ggml_tensor * ggml_get_rows_back(
|
|
824
|
+
struct ggml_context * ctx,
|
|
825
|
+
struct ggml_tensor * a,
|
|
826
|
+
struct ggml_tensor * b,
|
|
827
|
+
struct ggml_tensor * c);
|
|
828
|
+
|
|
829
|
+
GGML_API struct ggml_tensor * ggml_diag(
|
|
830
|
+
struct ggml_context * ctx,
|
|
831
|
+
struct ggml_tensor * a);
|
|
832
|
+
|
|
676
833
|
// set elements above the diagonal to -INF
|
|
677
|
-
// in-place, returns view(a)
|
|
678
834
|
GGML_API struct ggml_tensor * ggml_diag_mask_inf(
|
|
679
835
|
struct ggml_context * ctx,
|
|
680
836
|
struct ggml_tensor * a,
|
|
681
837
|
int n_past);
|
|
682
838
|
|
|
683
839
|
// in-place, returns view(a)
|
|
840
|
+
GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
|
|
841
|
+
struct ggml_context * ctx,
|
|
842
|
+
struct ggml_tensor * a,
|
|
843
|
+
int n_past);
|
|
844
|
+
|
|
845
|
+
// set elements above the diagonal to 0
|
|
846
|
+
GGML_API struct ggml_tensor * ggml_diag_mask_zero(
|
|
847
|
+
struct ggml_context * ctx,
|
|
848
|
+
struct ggml_tensor * a,
|
|
849
|
+
int n_past);
|
|
850
|
+
|
|
851
|
+
// in-place, returns view(a)
|
|
852
|
+
GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace(
|
|
853
|
+
struct ggml_context * ctx,
|
|
854
|
+
struct ggml_tensor * a,
|
|
855
|
+
int n_past);
|
|
856
|
+
|
|
684
857
|
GGML_API struct ggml_tensor * ggml_soft_max(
|
|
685
858
|
struct ggml_context * ctx,
|
|
686
859
|
struct ggml_tensor * a);
|
|
687
860
|
|
|
688
|
-
// rotary position embedding
|
|
689
861
|
// in-place, returns view(a)
|
|
862
|
+
GGML_API struct ggml_tensor * ggml_soft_max_inplace(
|
|
863
|
+
struct ggml_context * ctx,
|
|
864
|
+
struct ggml_tensor * a);
|
|
865
|
+
|
|
866
|
+
// rotary position embedding
|
|
690
867
|
// if mode & 1 == 1, skip n_past elements
|
|
691
868
|
// if (mode & 2) != 0, GPT-NeoX style
|
|
692
869
|
// TODO: avoid creating a new tensor every time
|
|
@@ -697,6 +874,23 @@ extern "C" {
|
|
|
697
874
|
int n_dims,
|
|
698
875
|
int mode);
|
|
699
876
|
|
|
877
|
+
// in-place, returns view(a)
|
|
878
|
+
GGML_API struct ggml_tensor * ggml_rope_inplace(
|
|
879
|
+
struct ggml_context * ctx,
|
|
880
|
+
struct ggml_tensor * a,
|
|
881
|
+
int n_past,
|
|
882
|
+
int n_dims,
|
|
883
|
+
int mode);
|
|
884
|
+
|
|
885
|
+
// rotary position embedding backward, i.e. compute dx from dy
|
|
886
|
+
// a - dy
|
|
887
|
+
GGML_API struct ggml_tensor * ggml_rope_back(
|
|
888
|
+
struct ggml_context * ctx,
|
|
889
|
+
struct ggml_tensor * a,
|
|
890
|
+
int n_past,
|
|
891
|
+
int n_dims,
|
|
892
|
+
int mode);
|
|
893
|
+
|
|
700
894
|
// alibi position embedding
|
|
701
895
|
// in-place, returns view(a)
|
|
702
896
|
struct ggml_tensor * ggml_alibi(
|
|
@@ -741,13 +935,13 @@ extern "C" {
|
|
|
741
935
|
GGML_API struct ggml_tensor * ggml_map_unary_f32(
|
|
742
936
|
struct ggml_context * ctx,
|
|
743
937
|
struct ggml_tensor * a,
|
|
744
|
-
|
|
938
|
+
ggml_unary_op_f32_t fun);
|
|
745
939
|
|
|
746
940
|
GGML_API struct ggml_tensor * ggml_map_binary_f32(
|
|
747
941
|
struct ggml_context * ctx,
|
|
748
942
|
struct ggml_tensor * a,
|
|
749
943
|
struct ggml_tensor * b,
|
|
750
|
-
|
|
944
|
+
ggml_binary_op_f32_t fun);
|
|
751
945
|
|
|
752
946
|
//
|
|
753
947
|
// automatic differentiation
|
|
@@ -876,7 +1070,6 @@ extern "C" {
|
|
|
876
1070
|
|
|
877
1071
|
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
878
1072
|
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
879
|
-
GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
880
1073
|
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
881
1074
|
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
882
1075
|
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
package/cpp/whisper.cpp
CHANGED
|
@@ -291,15 +291,6 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
|
|
|
291
291
|
{ MODEL_LARGE, 1124ull*MB },
|
|
292
292
|
},
|
|
293
293
|
},
|
|
294
|
-
{ GGML_TYPE_Q4_2,
|
|
295
|
-
{
|
|
296
|
-
{ MODEL_TINY, 26ull*MB },
|
|
297
|
-
{ MODEL_BASE, 50ull*MB },
|
|
298
|
-
{ MODEL_SMALL, 154ull*MB },
|
|
299
|
-
{ MODEL_MEDIUM, 470ull*MB },
|
|
300
|
-
{ MODEL_LARGE, 940ull*MB },
|
|
301
|
-
},
|
|
302
|
-
},
|
|
303
294
|
{ GGML_TYPE_Q5_0,
|
|
304
295
|
{
|
|
305
296
|
{ MODEL_TINY, 30ull*MB },
|
|
@@ -861,6 +852,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
861
852
|
model.type = e_model::MODEL_LARGE;
|
|
862
853
|
}
|
|
863
854
|
|
|
855
|
+
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
|
856
|
+
|
|
857
|
+
hparams.ftype %= GGML_QNT_VERSION_FACTOR;
|
|
858
|
+
|
|
864
859
|
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
|
865
860
|
// in order to save memory and also to speed up the computation
|
|
866
861
|
wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
|
@@ -882,6 +877,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
882
877
|
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
|
|
883
878
|
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
|
|
884
879
|
fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
|
|
880
|
+
fprintf(stderr, "%s: qntvr = %d\n", __func__, qntvr);
|
|
885
881
|
fprintf(stderr, "%s: type = %d\n", __func__, model.type);
|
|
886
882
|
|
|
887
883
|
// print memory requirements
|
|
@@ -1106,7 +1102,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
1106
1102
|
ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
|
|
1107
1103
|
}
|
|
1108
1104
|
|
|
1109
|
-
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*
|
|
1105
|
+
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead
|
|
1110
1106
|
|
|
1111
1107
|
fprintf(stderr, "%s: model ctx = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
|
1112
1108
|
}
|
|
@@ -1554,14 +1550,14 @@ static bool whisper_encode_internal(
|
|
|
1554
1550
|
Qcur),
|
|
1555
1551
|
Qcur);
|
|
1556
1552
|
|
|
1557
|
-
//Qcur =
|
|
1553
|
+
//Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
|
1558
1554
|
|
|
1559
1555
|
// note: no bias for Key
|
|
1560
1556
|
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
|
|
1561
1557
|
layer.attn_k_w,
|
|
1562
1558
|
cur);
|
|
1563
1559
|
|
|
1564
|
-
//Kcur =
|
|
1560
|
+
//Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
|
1565
1561
|
|
|
1566
1562
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
|
|
1567
1563
|
layer.attn_v_w,
|
|
@@ -1621,12 +1617,12 @@ static bool whisper_encode_internal(
|
|
|
1621
1617
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
|
1622
1618
|
|
|
1623
1619
|
struct ggml_tensor * KQ_scaled =
|
|
1624
|
-
|
|
1620
|
+
ggml_scale_inplace(ctx0,
|
|
1625
1621
|
KQ,
|
|
1626
1622
|
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
|
|
1627
1623
|
);
|
|
1628
1624
|
|
|
1629
|
-
struct ggml_tensor * KQ_soft_max =
|
|
1625
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
|
|
1630
1626
|
|
|
1631
1627
|
struct ggml_tensor * V =
|
|
1632
1628
|
ggml_cpy(ctx0,
|
|
@@ -1809,7 +1805,7 @@ static bool whisper_encode_internal(
|
|
|
1809
1805
|
layer.cross_attn_k_w,
|
|
1810
1806
|
cur);
|
|
1811
1807
|
|
|
1812
|
-
Kcross =
|
|
1808
|
+
Kcross = ggml_scale_inplace(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
|
|
1813
1809
|
|
|
1814
1810
|
wstate.use_buf(ctx0, 1);
|
|
1815
1811
|
|
|
@@ -1956,14 +1952,14 @@ static bool whisper_decode_internal(
|
|
|
1956
1952
|
Qcur),
|
|
1957
1953
|
Qcur);
|
|
1958
1954
|
|
|
1959
|
-
Qcur =
|
|
1955
|
+
Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
|
1960
1956
|
|
|
1961
1957
|
// note: no bias for Key
|
|
1962
1958
|
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
|
|
1963
1959
|
layer.attn_k_w,
|
|
1964
1960
|
cur);
|
|
1965
1961
|
|
|
1966
|
-
Kcur =
|
|
1962
|
+
Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
|
1967
1963
|
|
|
1968
1964
|
// store key and value to memory
|
|
1969
1965
|
{
|
|
@@ -2012,14 +2008,14 @@ static bool whisper_decode_internal(
|
|
|
2012
2008
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
|
2013
2009
|
|
|
2014
2010
|
//struct ggml_tensor * KQ_scaled =
|
|
2015
|
-
//
|
|
2011
|
+
// ggml_scale_inplace(ctx0,
|
|
2016
2012
|
// KQ,
|
|
2017
2013
|
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
|
|
2018
2014
|
// );
|
|
2019
2015
|
|
|
2020
|
-
struct ggml_tensor * KQ_masked =
|
|
2016
|
+
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
|
|
2021
2017
|
|
|
2022
|
-
struct ggml_tensor * KQ_soft_max =
|
|
2018
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
|
2023
2019
|
|
|
2024
2020
|
struct ggml_tensor * V =
|
|
2025
2021
|
ggml_view_3d(ctx0, kv_self.v,
|
|
@@ -2083,7 +2079,7 @@ static bool whisper_decode_internal(
|
|
|
2083
2079
|
Qcur),
|
|
2084
2080
|
Qcur);
|
|
2085
2081
|
|
|
2086
|
-
Qcur =
|
|
2082
|
+
Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
|
2087
2083
|
|
|
2088
2084
|
// Kcross is already scaled
|
|
2089
2085
|
struct ggml_tensor * Kcross =
|
|
@@ -2123,15 +2119,15 @@ static bool whisper_decode_internal(
|
|
|
2123
2119
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
|
2124
2120
|
|
|
2125
2121
|
//struct ggml_tensor * KQ_scaled =
|
|
2126
|
-
//
|
|
2122
|
+
// ggml_scale_inplace(ctx0,
|
|
2127
2123
|
// KQ,
|
|
2128
2124
|
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
|
|
2129
2125
|
// );
|
|
2130
2126
|
|
|
2131
2127
|
// no masking for cross-attention
|
|
2132
|
-
//struct ggml_tensor * KQ_masked =
|
|
2128
|
+
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
|
2133
2129
|
|
|
2134
|
-
struct ggml_tensor * KQ_soft_max =
|
|
2130
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);
|
|
2135
2131
|
|
|
2136
2132
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
|
2137
2133
|
|
|
@@ -2602,6 +2598,15 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
|
|
|
2602
2598
|
path_bin = path_bin.substr(0, pos);
|
|
2603
2599
|
}
|
|
2604
2600
|
|
|
2601
|
+
// match "-qx_x"
|
|
2602
|
+
pos = path_bin.rfind('-');
|
|
2603
|
+
if (pos != std::string::npos) {
|
|
2604
|
+
auto sub = path_bin.substr(pos);
|
|
2605
|
+
if (sub.size() == 5 && sub[1] == 'q' && sub[3] == '_') {
|
|
2606
|
+
path_bin = path_bin.substr(0, pos);
|
|
2607
|
+
}
|
|
2608
|
+
}
|
|
2609
|
+
|
|
2605
2610
|
path_bin += "-encoder.mlmodelc";
|
|
2606
2611
|
|
|
2607
2612
|
return path_bin;
|
|
@@ -4903,7 +4908,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
|
4903
4908
|
// b: N*N*sizeof(float)
|
|
4904
4909
|
// c: N*N*sizeof(float)
|
|
4905
4910
|
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
|
|
4906
|
-
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*
|
|
4911
|
+
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);
|
|
4907
4912
|
|
|
4908
4913
|
// put a bunch of random data in the buffer
|
|
4909
4914
|
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
|
|
@@ -4911,7 +4916,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
|
4911
4916
|
for (int j = 0; j < (int) sizes.size(); j++) {
|
|
4912
4917
|
int n_q4_0 = 0;
|
|
4913
4918
|
int n_q4_1 = 0;
|
|
4914
|
-
int n_q4_2 = 0;
|
|
4915
4919
|
int n_q5_0 = 0;
|
|
4916
4920
|
int n_q5_1 = 0;
|
|
4917
4921
|
int n_q8_0 = 0;
|
|
@@ -4921,7 +4925,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
|
4921
4925
|
// GFLOPS/s
|
|
4922
4926
|
double s_q4_0 = 0.0;
|
|
4923
4927
|
double s_q4_1 = 0.0;
|
|
4924
|
-
double s_q4_2 = 0.0;
|
|
4925
4928
|
double s_q5_0 = 0.0;
|
|
4926
4929
|
double s_q5_1 = 0.0;
|
|
4927
4930
|
double s_q8_0 = 0.0;
|
|
@@ -4930,18 +4933,17 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
|
4930
4933
|
|
|
4931
4934
|
const size_t N = sizes[j];
|
|
4932
4935
|
|
|
4933
|
-
for (int k = 0; k <
|
|
4936
|
+
for (int k = 0; k < 7; ++k) {
|
|
4934
4937
|
const ggml_type wtype =
|
|
4935
4938
|
k == 0 ? GGML_TYPE_Q4_0 :
|
|
4936
4939
|
k == 1 ? GGML_TYPE_Q4_1 :
|
|
4937
|
-
k == 2 ?
|
|
4938
|
-
k == 3 ?
|
|
4939
|
-
k == 4 ?
|
|
4940
|
-
k == 5 ?
|
|
4941
|
-
k == 6 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
|
4940
|
+
k == 2 ? GGML_TYPE_Q5_0 :
|
|
4941
|
+
k == 3 ? GGML_TYPE_Q5_1 :
|
|
4942
|
+
k == 4 ? GGML_TYPE_Q8_0 :
|
|
4943
|
+
k == 5 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
|
4942
4944
|
|
|
4943
|
-
double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ?
|
|
4944
|
-
int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ?
|
|
4945
|
+
double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32;
|
|
4946
|
+
int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? n_fp16 : /*k == 6*/ n_fp32;
|
|
4945
4947
|
|
|
4946
4948
|
struct ggml_init_params gparams = {
|
|
4947
4949
|
/*.mem_size =*/ buf.size(),
|
|
@@ -4985,9 +4987,9 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
|
4985
4987
|
s = ((2.0*N*N*N*n)/tsum)*1e-9;
|
|
4986
4988
|
}
|
|
4987
4989
|
|
|
4988
|
-
// Q4_0 | Q4_1
|
|
4989
|
-
snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)
|
|
4990
|
-
N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1
|
|
4990
|
+
// Q4_0 | Q4_1
|
|
4991
|
+
snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n",
|
|
4992
|
+
N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1);
|
|
4991
4993
|
s += strbuf;
|
|
4992
4994
|
|
|
4993
4995
|
// Q5_0 | Q5_1 | Q8_0
|