whisper.rn 0.3.0-rc.1 → 0.3.0-rc.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/cpp/ggml.c +4727 -2321
- package/cpp/ggml.h +216 -13
- package/cpp/whisper.cpp +55 -39
- package/cpp/whisper.h +3 -0
- package/lib/commonjs/index.js +8 -1
- package/lib/commonjs/index.js.map +1 -1
- package/lib/commonjs/version.json +1 -0
- package/lib/module/index.js +5 -1
- package/lib/module/index.js.map +1 -1
- package/lib/module/version.json +1 -0
- package/lib/typescript/index.d.ts +4 -3
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +5 -2
- package/src/index.ts +6 -2
- package/src/version.json +1 -0
package/cpp/ggml.h
CHANGED
|
@@ -190,9 +190,12 @@
|
|
|
190
190
|
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
|
191
191
|
#define GGML_FILE_VERSION 1
|
|
192
192
|
|
|
193
|
+
#define GGML_QNT_VERSION 2 // bump this on quantization format changes
|
|
194
|
+
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
|
195
|
+
|
|
193
196
|
#define GGML_MAX_DIMS 4
|
|
194
197
|
#define GGML_MAX_NODES 4096
|
|
195
|
-
#define GGML_MAX_PARAMS 16
|
|
198
|
+
#define GGML_MAX_PARAMS 256
|
|
196
199
|
#define GGML_MAX_CONTEXTS 64
|
|
197
200
|
#define GGML_MAX_OPT 4
|
|
198
201
|
#define GGML_DEFAULT_N_THREADS 4
|
|
@@ -231,7 +234,7 @@ extern "C" {
|
|
|
231
234
|
GGML_TYPE_F16 = 1,
|
|
232
235
|
GGML_TYPE_Q4_0 = 2,
|
|
233
236
|
GGML_TYPE_Q4_1 = 3,
|
|
234
|
-
GGML_TYPE_Q4_2 = 4,
|
|
237
|
+
// GGML_TYPE_Q4_2 = 4, support has been removed
|
|
235
238
|
// GGML_TYPE_Q4_3 (5) support has been removed
|
|
236
239
|
GGML_TYPE_Q5_0 = 6,
|
|
237
240
|
GGML_TYPE_Q5_1 = 7,
|
|
@@ -243,6 +246,11 @@ extern "C" {
|
|
|
243
246
|
GGML_TYPE_COUNT,
|
|
244
247
|
};
|
|
245
248
|
|
|
249
|
+
enum ggml_backend {
|
|
250
|
+
GGML_BACKEND_CPU = 0,
|
|
251
|
+
GGML_BACKEND_CUDA = 1,
|
|
252
|
+
};
|
|
253
|
+
|
|
246
254
|
// model file types
|
|
247
255
|
enum ggml_ftype {
|
|
248
256
|
GGML_FTYPE_UNKNOWN = -1,
|
|
@@ -251,7 +259,6 @@ extern "C" {
|
|
|
251
259
|
GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
|
252
260
|
GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
|
253
261
|
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
|
254
|
-
GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
|
|
255
262
|
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
|
256
263
|
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
|
257
264
|
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
|
@@ -263,12 +270,16 @@ extern "C" {
|
|
|
263
270
|
|
|
264
271
|
GGML_OP_DUP,
|
|
265
272
|
GGML_OP_ADD,
|
|
273
|
+
GGML_OP_ADD1,
|
|
274
|
+
GGML_OP_ACC,
|
|
266
275
|
GGML_OP_SUB,
|
|
267
276
|
GGML_OP_MUL,
|
|
268
277
|
GGML_OP_DIV,
|
|
269
278
|
GGML_OP_SQR,
|
|
270
279
|
GGML_OP_SQRT,
|
|
280
|
+
GGML_OP_LOG,
|
|
271
281
|
GGML_OP_SUM,
|
|
282
|
+
GGML_OP_SUM_ROWS,
|
|
272
283
|
GGML_OP_MEAN,
|
|
273
284
|
GGML_OP_REPEAT,
|
|
274
285
|
GGML_OP_ABS,
|
|
@@ -278,12 +289,15 @@ extern "C" {
|
|
|
278
289
|
GGML_OP_RELU,
|
|
279
290
|
GGML_OP_GELU,
|
|
280
291
|
GGML_OP_SILU,
|
|
292
|
+
GGML_OP_SILU_BACK,
|
|
281
293
|
GGML_OP_NORM, // normalize
|
|
282
294
|
GGML_OP_RMS_NORM,
|
|
295
|
+
GGML_OP_RMS_NORM_BACK,
|
|
283
296
|
|
|
284
297
|
GGML_OP_MUL_MAT,
|
|
285
298
|
|
|
286
299
|
GGML_OP_SCALE,
|
|
300
|
+
GGML_OP_SET,
|
|
287
301
|
GGML_OP_CPY,
|
|
288
302
|
GGML_OP_CONT,
|
|
289
303
|
GGML_OP_RESHAPE,
|
|
@@ -291,10 +305,15 @@ extern "C" {
|
|
|
291
305
|
GGML_OP_PERMUTE,
|
|
292
306
|
GGML_OP_TRANSPOSE,
|
|
293
307
|
GGML_OP_GET_ROWS,
|
|
308
|
+
GGML_OP_GET_ROWS_BACK,
|
|
309
|
+
GGML_OP_DIAG,
|
|
294
310
|
GGML_OP_DIAG_MASK_INF,
|
|
311
|
+
GGML_OP_DIAG_MASK_ZERO,
|
|
295
312
|
GGML_OP_SOFT_MAX,
|
|
296
313
|
GGML_OP_ROPE,
|
|
314
|
+
GGML_OP_ROPE_BACK,
|
|
297
315
|
GGML_OP_ALIBI,
|
|
316
|
+
GGML_OP_CLAMP,
|
|
298
317
|
GGML_OP_CONV_1D_1S,
|
|
299
318
|
GGML_OP_CONV_1D_2S,
|
|
300
319
|
|
|
@@ -322,7 +341,8 @@ extern "C" {
|
|
|
322
341
|
|
|
323
342
|
// n-dimensional tensor
|
|
324
343
|
struct ggml_tensor {
|
|
325
|
-
enum ggml_type type;
|
|
344
|
+
enum ggml_type type;
|
|
345
|
+
enum ggml_backend backend;
|
|
326
346
|
|
|
327
347
|
int n_dims;
|
|
328
348
|
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
|
@@ -353,7 +373,7 @@ extern "C" {
|
|
|
353
373
|
|
|
354
374
|
char name[32];
|
|
355
375
|
|
|
356
|
-
char padding[
|
|
376
|
+
char padding[16];
|
|
357
377
|
};
|
|
358
378
|
|
|
359
379
|
// computation graph
|
|
@@ -497,6 +517,29 @@ extern "C" {
|
|
|
497
517
|
struct ggml_tensor * a,
|
|
498
518
|
struct ggml_tensor * b);
|
|
499
519
|
|
|
520
|
+
GGML_API struct ggml_tensor * ggml_add1(
|
|
521
|
+
struct ggml_context * ctx,
|
|
522
|
+
struct ggml_tensor * a,
|
|
523
|
+
struct ggml_tensor * b);
|
|
524
|
+
|
|
525
|
+
GGML_API struct ggml_tensor * ggml_acc(
|
|
526
|
+
struct ggml_context * ctx,
|
|
527
|
+
struct ggml_tensor * a,
|
|
528
|
+
struct ggml_tensor * b,
|
|
529
|
+
size_t nb1,
|
|
530
|
+
size_t nb2,
|
|
531
|
+
size_t nb3,
|
|
532
|
+
size_t offset);
|
|
533
|
+
|
|
534
|
+
GGML_API struct ggml_tensor * ggml_acc_inplace(
|
|
535
|
+
struct ggml_context * ctx,
|
|
536
|
+
struct ggml_tensor * a,
|
|
537
|
+
struct ggml_tensor * b,
|
|
538
|
+
size_t nb1,
|
|
539
|
+
size_t nb2,
|
|
540
|
+
size_t nb3,
|
|
541
|
+
size_t offset);
|
|
542
|
+
|
|
500
543
|
GGML_API struct ggml_tensor * ggml_sub(
|
|
501
544
|
struct ggml_context * ctx,
|
|
502
545
|
struct ggml_tensor * a,
|
|
@@ -520,12 +563,24 @@ extern "C" {
|
|
|
520
563
|
struct ggml_context * ctx,
|
|
521
564
|
struct ggml_tensor * a);
|
|
522
565
|
|
|
566
|
+
GGML_API struct ggml_tensor * ggml_log(
|
|
567
|
+
struct ggml_context * ctx,
|
|
568
|
+
struct ggml_tensor * a);
|
|
569
|
+
|
|
570
|
+
GGML_API struct ggml_tensor * ggml_log_inplace(
|
|
571
|
+
struct ggml_context * ctx,
|
|
572
|
+
struct ggml_tensor * a);
|
|
573
|
+
|
|
523
574
|
// return scalar
|
|
524
|
-
// TODO: compute sum along rows
|
|
525
575
|
GGML_API struct ggml_tensor * ggml_sum(
|
|
526
576
|
struct ggml_context * ctx,
|
|
527
577
|
struct ggml_tensor * a);
|
|
528
578
|
|
|
579
|
+
// sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
|
|
580
|
+
GGML_API struct ggml_tensor * ggml_sum_rows(
|
|
581
|
+
struct ggml_context * ctx,
|
|
582
|
+
struct ggml_tensor * a);
|
|
583
|
+
|
|
529
584
|
// mean along rows
|
|
530
585
|
GGML_API struct ggml_tensor * ggml_mean(
|
|
531
586
|
struct ggml_context * ctx,
|
|
@@ -567,6 +622,13 @@ extern "C" {
|
|
|
567
622
|
struct ggml_context * ctx,
|
|
568
623
|
struct ggml_tensor * a);
|
|
569
624
|
|
|
625
|
+
// a - x
|
|
626
|
+
// b - dy
|
|
627
|
+
GGML_API struct ggml_tensor * ggml_silu_back(
|
|
628
|
+
struct ggml_context * ctx,
|
|
629
|
+
struct ggml_tensor * a,
|
|
630
|
+
struct ggml_tensor * b);
|
|
631
|
+
|
|
570
632
|
// normalize along rows
|
|
571
633
|
// TODO: eps is hardcoded to 1e-5 for now
|
|
572
634
|
GGML_API struct ggml_tensor * ggml_norm(
|
|
@@ -577,6 +639,13 @@ extern "C" {
|
|
|
577
639
|
struct ggml_context * ctx,
|
|
578
640
|
struct ggml_tensor * a);
|
|
579
641
|
|
|
642
|
+
// a - x
|
|
643
|
+
// b - dy
|
|
644
|
+
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
|
645
|
+
struct ggml_context * ctx,
|
|
646
|
+
struct ggml_tensor * a,
|
|
647
|
+
struct ggml_tensor * b);
|
|
648
|
+
|
|
580
649
|
// A: m rows, n columns
|
|
581
650
|
// B: p rows, n columns (i.e. we transpose it internally)
|
|
582
651
|
// result is m columns, p rows
|
|
@@ -589,12 +658,66 @@ extern "C" {
|
|
|
589
658
|
// operations on tensors without backpropagation
|
|
590
659
|
//
|
|
591
660
|
|
|
592
|
-
// in-place, returns view(a)
|
|
593
661
|
GGML_API struct ggml_tensor * ggml_scale(
|
|
594
662
|
struct ggml_context * ctx,
|
|
595
663
|
struct ggml_tensor * a,
|
|
596
664
|
struct ggml_tensor * b);
|
|
597
665
|
|
|
666
|
+
// in-place, returns view(a)
|
|
667
|
+
GGML_API struct ggml_tensor * ggml_scale_inplace(
|
|
668
|
+
struct ggml_context * ctx,
|
|
669
|
+
struct ggml_tensor * a,
|
|
670
|
+
struct ggml_tensor * b);
|
|
671
|
+
|
|
672
|
+
// b -> view(a,offset,nb1,nb2,3), return modified a
|
|
673
|
+
GGML_API struct ggml_tensor * ggml_set(
|
|
674
|
+
struct ggml_context * ctx,
|
|
675
|
+
struct ggml_tensor * a,
|
|
676
|
+
struct ggml_tensor * b,
|
|
677
|
+
size_t nb1,
|
|
678
|
+
size_t nb2,
|
|
679
|
+
size_t nb3,
|
|
680
|
+
size_t offset);
|
|
681
|
+
|
|
682
|
+
// b -> view(a,offset,nb1,nb2,3), return view(a)
|
|
683
|
+
GGML_API struct ggml_tensor * ggml_set_inplace(
|
|
684
|
+
struct ggml_context * ctx,
|
|
685
|
+
struct ggml_tensor * a,
|
|
686
|
+
struct ggml_tensor * b,
|
|
687
|
+
size_t nb1,
|
|
688
|
+
size_t nb2,
|
|
689
|
+
size_t nb3,
|
|
690
|
+
size_t offset);
|
|
691
|
+
|
|
692
|
+
GGML_API struct ggml_tensor * ggml_set_1d(
|
|
693
|
+
struct ggml_context * ctx,
|
|
694
|
+
struct ggml_tensor * a,
|
|
695
|
+
struct ggml_tensor * b,
|
|
696
|
+
size_t offset);
|
|
697
|
+
|
|
698
|
+
GGML_API struct ggml_tensor * ggml_set_1d_inplace(
|
|
699
|
+
struct ggml_context * ctx,
|
|
700
|
+
struct ggml_tensor * a,
|
|
701
|
+
struct ggml_tensor * b,
|
|
702
|
+
size_t offset);
|
|
703
|
+
|
|
704
|
+
// b -> view(a,offset,nb1,nb2,3), return modified a
|
|
705
|
+
GGML_API struct ggml_tensor * ggml_set_2d(
|
|
706
|
+
struct ggml_context * ctx,
|
|
707
|
+
struct ggml_tensor * a,
|
|
708
|
+
struct ggml_tensor * b,
|
|
709
|
+
size_t nb1,
|
|
710
|
+
size_t offset);
|
|
711
|
+
|
|
712
|
+
// b -> view(a,offset,nb1,nb2,3), return view(a)
|
|
713
|
+
GGML_API struct ggml_tensor * ggml_set_2d_inplace(
|
|
714
|
+
struct ggml_context * ctx,
|
|
715
|
+
struct ggml_tensor * a,
|
|
716
|
+
struct ggml_tensor * b,
|
|
717
|
+
size_t nb1,
|
|
718
|
+
size_t offset);
|
|
719
|
+
|
|
720
|
+
|
|
598
721
|
// a -> b, return view(b)
|
|
599
722
|
GGML_API struct ggml_tensor * ggml_cpy(
|
|
600
723
|
struct ggml_context * ctx,
|
|
@@ -615,6 +738,11 @@ extern "C" {
|
|
|
615
738
|
|
|
616
739
|
// return view(a)
|
|
617
740
|
// TODO: when we start computing gradient, make a copy instead of view
|
|
741
|
+
GGML_API struct ggml_tensor * ggml_reshape_1d(
|
|
742
|
+
struct ggml_context * ctx,
|
|
743
|
+
struct ggml_tensor * a,
|
|
744
|
+
int64_t ne0);
|
|
745
|
+
|
|
618
746
|
GGML_API struct ggml_tensor * ggml_reshape_2d(
|
|
619
747
|
struct ggml_context * ctx,
|
|
620
748
|
struct ggml_tensor * a,
|
|
@@ -630,6 +758,14 @@ extern "C" {
|
|
|
630
758
|
int64_t ne1,
|
|
631
759
|
int64_t ne2);
|
|
632
760
|
|
|
761
|
+
GGML_API struct ggml_tensor * ggml_reshape_4d(
|
|
762
|
+
struct ggml_context * ctx,
|
|
763
|
+
struct ggml_tensor * a,
|
|
764
|
+
int64_t ne0,
|
|
765
|
+
int64_t ne1,
|
|
766
|
+
int64_t ne2,
|
|
767
|
+
int64_t ne3);
|
|
768
|
+
|
|
633
769
|
// offset in bytes
|
|
634
770
|
GGML_API struct ggml_tensor * ggml_view_1d(
|
|
635
771
|
struct ggml_context * ctx,
|
|
@@ -655,6 +791,18 @@ extern "C" {
|
|
|
655
791
|
size_t nb2, // slice stride in bytes
|
|
656
792
|
size_t offset);
|
|
657
793
|
|
|
794
|
+
GGML_API struct ggml_tensor * ggml_view_4d(
|
|
795
|
+
struct ggml_context * ctx,
|
|
796
|
+
struct ggml_tensor * a,
|
|
797
|
+
int64_t ne0,
|
|
798
|
+
int64_t ne1,
|
|
799
|
+
int64_t ne2,
|
|
800
|
+
int64_t ne3,
|
|
801
|
+
size_t nb1, // row stride in bytes
|
|
802
|
+
size_t nb2, // slice stride in bytes
|
|
803
|
+
size_t nb3,
|
|
804
|
+
size_t offset);
|
|
805
|
+
|
|
658
806
|
GGML_API struct ggml_tensor * ggml_permute(
|
|
659
807
|
struct ggml_context * ctx,
|
|
660
808
|
struct ggml_tensor * a,
|
|
@@ -673,20 +821,50 @@ extern "C" {
|
|
|
673
821
|
struct ggml_tensor * a,
|
|
674
822
|
struct ggml_tensor * b);
|
|
675
823
|
|
|
824
|
+
GGML_API struct ggml_tensor * ggml_get_rows_back(
|
|
825
|
+
struct ggml_context * ctx,
|
|
826
|
+
struct ggml_tensor * a,
|
|
827
|
+
struct ggml_tensor * b,
|
|
828
|
+
struct ggml_tensor * c);
|
|
829
|
+
|
|
830
|
+
GGML_API struct ggml_tensor * ggml_diag(
|
|
831
|
+
struct ggml_context * ctx,
|
|
832
|
+
struct ggml_tensor * a);
|
|
833
|
+
|
|
676
834
|
// set elements above the diagonal to -INF
|
|
677
|
-
// in-place, returns view(a)
|
|
678
835
|
GGML_API struct ggml_tensor * ggml_diag_mask_inf(
|
|
679
836
|
struct ggml_context * ctx,
|
|
680
837
|
struct ggml_tensor * a,
|
|
681
838
|
int n_past);
|
|
682
839
|
|
|
683
840
|
// in-place, returns view(a)
|
|
841
|
+
GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
|
|
842
|
+
struct ggml_context * ctx,
|
|
843
|
+
struct ggml_tensor * a,
|
|
844
|
+
int n_past);
|
|
845
|
+
|
|
846
|
+
// set elements above the diagonal to 0
|
|
847
|
+
GGML_API struct ggml_tensor * ggml_diag_mask_zero(
|
|
848
|
+
struct ggml_context * ctx,
|
|
849
|
+
struct ggml_tensor * a,
|
|
850
|
+
int n_past);
|
|
851
|
+
|
|
852
|
+
// in-place, returns view(a)
|
|
853
|
+
GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
|
|
854
|
+
struct ggml_context * ctx,
|
|
855
|
+
struct ggml_tensor * a,
|
|
856
|
+
int n_past);
|
|
857
|
+
|
|
684
858
|
GGML_API struct ggml_tensor * ggml_soft_max(
|
|
685
859
|
struct ggml_context * ctx,
|
|
686
860
|
struct ggml_tensor * a);
|
|
687
861
|
|
|
688
|
-
// rotary position embedding
|
|
689
862
|
// in-place, returns view(a)
|
|
863
|
+
GGML_API struct ggml_tensor * ggml_soft_max_inplace(
|
|
864
|
+
struct ggml_context * ctx,
|
|
865
|
+
struct ggml_tensor * a);
|
|
866
|
+
|
|
867
|
+
// rotary position embedding
|
|
690
868
|
// if mode & 1 == 1, skip n_past elements
|
|
691
869
|
// if mode & 2 == 1, GPT-NeoX style
|
|
692
870
|
// TODO: avoid creating a new tensor every time
|
|
@@ -697,13 +875,39 @@ extern "C" {
|
|
|
697
875
|
int n_dims,
|
|
698
876
|
int mode);
|
|
699
877
|
|
|
878
|
+
// in-place, returns view(a)
|
|
879
|
+
GGML_API struct ggml_tensor * ggml_rope_inplace(
|
|
880
|
+
struct ggml_context * ctx,
|
|
881
|
+
struct ggml_tensor * a,
|
|
882
|
+
int n_past,
|
|
883
|
+
int n_dims,
|
|
884
|
+
int mode);
|
|
885
|
+
|
|
886
|
+
// rotary position embedding backward, i.e compute dx from dy
|
|
887
|
+
// a - dy
|
|
888
|
+
GGML_API struct ggml_tensor * ggml_rope_back(
|
|
889
|
+
struct ggml_context * ctx,
|
|
890
|
+
struct ggml_tensor * a,
|
|
891
|
+
int n_past,
|
|
892
|
+
int n_dims,
|
|
893
|
+
int mode);
|
|
894
|
+
|
|
700
895
|
// alibi position embedding
|
|
701
896
|
// in-place, returns view(a)
|
|
702
897
|
struct ggml_tensor * ggml_alibi(
|
|
703
898
|
struct ggml_context * ctx,
|
|
704
899
|
struct ggml_tensor * a,
|
|
705
900
|
int n_past,
|
|
706
|
-
int n_head);
|
|
901
|
+
int n_head,
|
|
902
|
+
float bias_max);
|
|
903
|
+
|
|
904
|
+
// clamp
|
|
905
|
+
// in-place, returns view(a)
|
|
906
|
+
struct ggml_tensor * ggml_clamp(
|
|
907
|
+
struct ggml_context * ctx,
|
|
908
|
+
struct ggml_tensor * a,
|
|
909
|
+
float min,
|
|
910
|
+
float max);
|
|
707
911
|
|
|
708
912
|
// padding = 1
|
|
709
913
|
// TODO: we don't support extra parameters for now
|
|
@@ -741,13 +945,13 @@ extern "C" {
|
|
|
741
945
|
GGML_API struct ggml_tensor * ggml_map_unary_f32(
|
|
742
946
|
struct ggml_context * ctx,
|
|
743
947
|
struct ggml_tensor * a,
|
|
744
|
-
|
|
948
|
+
ggml_unary_op_f32_t fun);
|
|
745
949
|
|
|
746
950
|
GGML_API struct ggml_tensor * ggml_map_binary_f32(
|
|
747
951
|
struct ggml_context * ctx,
|
|
748
952
|
struct ggml_tensor * a,
|
|
749
953
|
struct ggml_tensor * b,
|
|
750
|
-
|
|
954
|
+
ggml_binary_op_f32_t fun);
|
|
751
955
|
|
|
752
956
|
//
|
|
753
957
|
// automatic differentiation
|
|
@@ -876,7 +1080,6 @@ extern "C" {
|
|
|
876
1080
|
|
|
877
1081
|
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
878
1082
|
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
879
|
-
GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
880
1083
|
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
881
1084
|
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
882
1085
|
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
package/cpp/whisper.cpp
CHANGED
|
@@ -139,7 +139,7 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
|
|
|
139
139
|
{ "hi", { 17, "hindi", } },
|
|
140
140
|
{ "fi", { 18, "finnish", } },
|
|
141
141
|
{ "vi", { 19, "vietnamese", } },
|
|
142
|
-
{ "iw", { 20, "hebrew", } },
|
|
142
|
+
{ "he", { 20, "hebrew", } },
|
|
143
143
|
{ "uk", { 21, "ukrainian", } },
|
|
144
144
|
{ "el", { 22, "greek", } },
|
|
145
145
|
{ "ms", { 23, "malay", } },
|
|
@@ -291,15 +291,6 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
|
|
|
291
291
|
{ MODEL_LARGE, 1124ull*MB },
|
|
292
292
|
},
|
|
293
293
|
},
|
|
294
|
-
{ GGML_TYPE_Q4_2,
|
|
295
|
-
{
|
|
296
|
-
{ MODEL_TINY, 26ull*MB },
|
|
297
|
-
{ MODEL_BASE, 50ull*MB },
|
|
298
|
-
{ MODEL_SMALL, 154ull*MB },
|
|
299
|
-
{ MODEL_MEDIUM, 470ull*MB },
|
|
300
|
-
{ MODEL_LARGE, 940ull*MB },
|
|
301
|
-
},
|
|
302
|
-
},
|
|
303
294
|
{ GGML_TYPE_Q5_0,
|
|
304
295
|
{
|
|
305
296
|
{ MODEL_TINY, 30ull*MB },
|
|
@@ -861,6 +852,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
861
852
|
model.type = e_model::MODEL_LARGE;
|
|
862
853
|
}
|
|
863
854
|
|
|
855
|
+
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
|
856
|
+
|
|
857
|
+
hparams.ftype %= GGML_QNT_VERSION_FACTOR;
|
|
858
|
+
|
|
864
859
|
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
|
865
860
|
// in order to save memory and also to speed up the computation
|
|
866
861
|
wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
|
@@ -882,6 +877,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
882
877
|
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
|
|
883
878
|
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
|
|
884
879
|
fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
|
|
880
|
+
fprintf(stderr, "%s: qntvr = %d\n", __func__, qntvr);
|
|
885
881
|
fprintf(stderr, "%s: type = %d\n", __func__, model.type);
|
|
886
882
|
|
|
887
883
|
// print memory requirements
|
|
@@ -1106,7 +1102,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
1106
1102
|
ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
|
|
1107
1103
|
}
|
|
1108
1104
|
|
|
1109
|
-
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
|
|
1105
|
+
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead
|
|
1110
1106
|
|
|
1111
1107
|
fprintf(stderr, "%s: model ctx = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
|
1112
1108
|
}
|
|
@@ -1554,14 +1550,14 @@ static bool whisper_encode_internal(
|
|
|
1554
1550
|
Qcur),
|
|
1555
1551
|
Qcur);
|
|
1556
1552
|
|
|
1557
|
-
//Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
|
1553
|
+
//Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
|
1558
1554
|
|
|
1559
1555
|
// note: no bias for Key
|
|
1560
1556
|
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
|
|
1561
1557
|
layer.attn_k_w,
|
|
1562
1558
|
cur);
|
|
1563
1559
|
|
|
1564
|
-
//Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
|
1560
|
+
//Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
|
1565
1561
|
|
|
1566
1562
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
|
|
1567
1563
|
layer.attn_v_w,
|
|
@@ -1621,12 +1617,12 @@ static bool whisper_encode_internal(
|
|
|
1621
1617
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
|
1622
1618
|
|
|
1623
1619
|
struct ggml_tensor * KQ_scaled =
|
|
1624
|
-
|
|
1620
|
+
ggml_scale_inplace(ctx0,
|
|
1625
1621
|
KQ,
|
|
1626
1622
|
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
|
|
1627
1623
|
);
|
|
1628
1624
|
|
|
1629
|
-
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
|
|
1625
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
|
|
1630
1626
|
|
|
1631
1627
|
struct ggml_tensor * V =
|
|
1632
1628
|
ggml_cpy(ctx0,
|
|
@@ -1809,7 +1805,7 @@ static bool whisper_encode_internal(
|
|
|
1809
1805
|
layer.cross_attn_k_w,
|
|
1810
1806
|
cur);
|
|
1811
1807
|
|
|
1812
|
-
Kcross = ggml_scale(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
|
|
1808
|
+
Kcross = ggml_scale_inplace(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
|
|
1813
1809
|
|
|
1814
1810
|
wstate.use_buf(ctx0, 1);
|
|
1815
1811
|
|
|
@@ -1956,14 +1952,14 @@ static bool whisper_decode_internal(
|
|
|
1956
1952
|
Qcur),
|
|
1957
1953
|
Qcur);
|
|
1958
1954
|
|
|
1959
|
-
Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
|
1955
|
+
Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
|
1960
1956
|
|
|
1961
1957
|
// note: no bias for Key
|
|
1962
1958
|
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
|
|
1963
1959
|
layer.attn_k_w,
|
|
1964
1960
|
cur);
|
|
1965
1961
|
|
|
1966
|
-
Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
|
1962
|
+
Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
|
1967
1963
|
|
|
1968
1964
|
// store key and value to memory
|
|
1969
1965
|
{
|
|
@@ -2012,14 +2008,14 @@ static bool whisper_decode_internal(
|
|
|
2012
2008
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
|
2013
2009
|
|
|
2014
2010
|
//struct ggml_tensor * KQ_scaled =
|
|
2015
|
-
//        ggml_scale(ctx0,
|
|
2011
|
+
// ggml_scale_inplace(ctx0,
|
|
2016
2012
|
// KQ,
|
|
2017
2013
|
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
|
|
2018
2014
|
// );
|
|
2019
2015
|
|
|
2020
|
-
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
|
|
2016
|
+
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
|
|
2021
2017
|
|
|
2022
|
-
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
|
2018
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
|
2023
2019
|
|
|
2024
2020
|
struct ggml_tensor * V =
|
|
2025
2021
|
ggml_view_3d(ctx0, kv_self.v,
|
|
@@ -2083,7 +2079,7 @@ static bool whisper_decode_internal(
|
|
|
2083
2079
|
Qcur),
|
|
2084
2080
|
Qcur);
|
|
2085
2081
|
|
|
2086
|
-
Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
|
2082
|
+
Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
|
2087
2083
|
|
|
2088
2084
|
// Kcross is already scaled
|
|
2089
2085
|
struct ggml_tensor * Kcross =
|
|
@@ -2123,15 +2119,15 @@ static bool whisper_decode_internal(
|
|
|
2123
2119
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
|
2124
2120
|
|
|
2125
2121
|
//struct ggml_tensor * KQ_scaled =
|
|
2126
|
-
//        ggml_scale(ctx0,
|
|
2122
|
+
// ggml_scale_inplace(ctx0,
|
|
2127
2123
|
// KQ,
|
|
2128
2124
|
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
|
|
2129
2125
|
// );
|
|
2130
2126
|
|
|
2131
2127
|
// no masking for cross-attention
|
|
2132
|
-
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
|
|
2128
|
+
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
|
2133
2129
|
|
|
2134
|
-
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
|
|
2130
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);
|
|
2135
2131
|
|
|
2136
2132
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
|
2137
2133
|
|
|
@@ -2602,6 +2598,15 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
|
|
|
2602
2598
|
path_bin = path_bin.substr(0, pos);
|
|
2603
2599
|
}
|
|
2604
2600
|
|
|
2601
|
+
// match "-qx_x"
|
|
2602
|
+
pos = path_bin.rfind('-');
|
|
2603
|
+
if (pos != std::string::npos) {
|
|
2604
|
+
auto sub = path_bin.substr(pos);
|
|
2605
|
+
if (sub.size() == 5 && sub[1] == 'q' && sub[3] == '_') {
|
|
2606
|
+
path_bin = path_bin.substr(0, pos);
|
|
2607
|
+
}
|
|
2608
|
+
}
|
|
2609
|
+
|
|
2605
2610
|
path_bin += "-encoder.mlmodelc";
|
|
2606
2611
|
|
|
2607
2612
|
return path_bin;
|
|
@@ -2847,6 +2852,12 @@ void whisper_free(struct whisper_context * ctx) {
|
|
|
2847
2852
|
}
|
|
2848
2853
|
}
|
|
2849
2854
|
|
|
2855
|
+
void whisper_free_params(struct whisper_full_params * params) {
|
|
2856
|
+
if (params) {
|
|
2857
|
+
delete params;
|
|
2858
|
+
}
|
|
2859
|
+
}
|
|
2860
|
+
|
|
2850
2861
|
int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
|
|
2851
2862
|
if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
|
|
2852
2863
|
fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
|
|
@@ -3280,6 +3291,14 @@ const char * whisper_print_system_info(void) {
|
|
|
3280
3291
|
|
|
3281
3292
|
////////////////////////////////////////////////////////////////////////////
|
|
3282
3293
|
|
|
3294
|
+
struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy) {
|
|
3295
|
+
struct whisper_full_params params = whisper_full_default_params(strategy);
|
|
3296
|
+
|
|
3297
|
+
struct whisper_full_params* result = new whisper_full_params();
|
|
3298
|
+
*result = params;
|
|
3299
|
+
return result;
|
|
3300
|
+
}
|
|
3301
|
+
|
|
3283
3302
|
struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
|
|
3284
3303
|
struct whisper_full_params result = {
|
|
3285
3304
|
/*.strategy =*/ strategy,
|
|
@@ -4903,7 +4922,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
|
4903
4922
|
// b: N*N*sizeof(float)
|
|
4904
4923
|
// c: N*N*sizeof(float)
|
|
4905
4924
|
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
|
|
4906
|
-
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
|
|
4925
|
+
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);
|
|
4907
4926
|
|
|
4908
4927
|
// put a bunch of random data in the buffer
|
|
4909
4928
|
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
|
|
@@ -4911,7 +4930,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
|
4911
4930
|
for (int j = 0; j < (int) sizes.size(); j++) {
|
|
4912
4931
|
int n_q4_0 = 0;
|
|
4913
4932
|
int n_q4_1 = 0;
|
|
4914
|
-
int n_q4_2 = 0;
|
|
4915
4933
|
int n_q5_0 = 0;
|
|
4916
4934
|
int n_q5_1 = 0;
|
|
4917
4935
|
int n_q8_0 = 0;
|
|
@@ -4921,7 +4939,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
|
4921
4939
|
// GFLOPS/s
|
|
4922
4940
|
double s_q4_0 = 0.0;
|
|
4923
4941
|
double s_q4_1 = 0.0;
|
|
4924
|
-
double s_q4_2 = 0.0;
|
|
4925
4942
|
double s_q5_0 = 0.0;
|
|
4926
4943
|
double s_q5_1 = 0.0;
|
|
4927
4944
|
double s_q8_0 = 0.0;
|
|
@@ -4930,18 +4947,17 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
|
4930
4947
|
|
|
4931
4948
|
const size_t N = sizes[j];
|
|
4932
4949
|
|
|
4933
|
-
for (int k = 0; k < 8; ++k) {
|
|
4950
|
+
for (int k = 0; k < 7; ++k) {
|
|
4934
4951
|
const ggml_type wtype =
|
|
4935
4952
|
k == 0 ? GGML_TYPE_Q4_0 :
|
|
4936
4953
|
k == 1 ? GGML_TYPE_Q4_1 :
|
|
4937
|
-
k == 2 ? GGML_TYPE_Q4_2 :
|
|
4938
|
-
k == 3 ? GGML_TYPE_Q5_0 :
|
|
4939
|
-
k == 4 ? GGML_TYPE_Q5_1 :
|
|
4940
|
-
k == 5 ? GGML_TYPE_Q8_0 :
|
|
4941
|
-
k == 6 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
|
4954
|
+
k == 2 ? GGML_TYPE_Q5_0 :
|
|
4955
|
+
k == 3 ? GGML_TYPE_Q5_1 :
|
|
4956
|
+
k == 4 ? GGML_TYPE_Q8_0 :
|
|
4957
|
+
k == 5 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
|
4942
4958
|
|
|
4943
|
-
double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q4_2 : k == 3 ? s_q5_0 : k == 4 ? s_q5_1 : k == 5 ? s_q8_0 : k == 6 ? s_fp16 : /*k == 7*/ s_fp32;
|
|
4944
|
-
int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q4_2 : k == 3 ? n_q5_0 : k == 4 ? n_q5_1 : k == 5 ? n_q8_0 : k == 6 ? n_fp16 : /*k == 7*/ n_fp32;
|
|
4959
|
+
double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32;
|
|
4960
|
+
int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? n_fp16 : /*k == 6*/ n_fp32;
|
|
4945
4961
|
|
|
4946
4962
|
struct ggml_init_params gparams = {
|
|
4947
4963
|
/*.mem_size =*/ buf.size(),
|
|
@@ -4985,9 +5001,9 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
|
4985
5001
|
s = ((2.0*N*N*N*n)/tsum)*1e-9;
|
|
4986
5002
|
}
|
|
4987
5003
|
|
|
4988
|
-
// Q4_0 | Q4_1
|
|
4989
|
-
snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs) | Q4_2 %7.1f GFLOPS (%3d runs)\n",
|
|
4990
|
-
N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_q4_2, n_q4_2);
|
|
5004
|
+
// Q4_0 | Q4_1
|
|
5005
|
+
snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n",
|
|
5006
|
+
N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1);
|
|
4991
5007
|
s += strbuf;
|
|
4992
5008
|
|
|
4993
5009
|
// Q5_0 | Q5_1 | Q8_0
|