whisper.rn 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/android/src/main/java/com/rnwhisper/WhisperContext.java +37 -6
- package/android/src/main/jni/whisper/jni.cpp +22 -1
- package/cpp/rn-whisper.cpp +7 -0
- package/cpp/rn-whisper.h +1 -0
- package/cpp/whisper.cpp +114 -102
- package/cpp/whisper.h +6 -0
- package/ios/RNWhisper.mm +46 -15
- package/ios/RNWhisperContext.h +4 -2
- package/ios/RNWhisperContext.mm +51 -13
- package/jest/mock.js +1 -0
- package/lib/commonjs/NativeRNWhisper.js.map +1 -1
- package/lib/commonjs/index.js +42 -2
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNWhisper.js.map +1 -1
- package/lib/module/index.js +42 -2
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNWhisper.d.ts +3 -0
- package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +12 -1
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +2 -2
- package/src/NativeRNWhisper.ts +3 -0
- package/src/index.ts +57 -3
package/cpp/whisper.cpp
CHANGED
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
#define _USE_MATH_DEFINES
|
|
15
15
|
#include <cmath>
|
|
16
16
|
#include <cstdio>
|
|
17
|
+
#include <cstdarg>
|
|
17
18
|
#include <cstring>
|
|
18
19
|
#include <fstream>
|
|
19
20
|
#include <map>
|
|
@@ -92,7 +93,7 @@ static void byteswap_tensor(ggml_tensor * tensor) {
|
|
|
92
93
|
// Runtime assertion: when (x) evaluates to false, reports the file, line and
// the stringified expression through log() and then calls abort().
// Wrapped in do { ... } while (0) so it behaves as a single statement.
#define WHISPER_ASSERT(x) \
|
|
93
94
|
do { \
|
|
94
95
|
if (!(x)) { \
|
|
95
|
-
|
|
96
|
+
log("WHISPER_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
|
96
97
|
abort(); \
|
|
97
98
|
} \
|
|
98
99
|
} while (0)
|
|
@@ -723,6 +724,21 @@ struct whisper_context {
|
|
|
723
724
|
std::string path_model; // populated by whisper_init_from_file()
|
|
724
725
|
};
|
|
725
726
|
|
|
727
|
+
// Default log sink: writes the already-formatted message to stderr verbatim,
// without appending a newline or any prefix.
static void whisper_default_log(const char * text) {
    fputs(text, stderr);
}
|
|
730
|
+
|
|
731
|
+
// Currently installed log sink. Starts out as whisper_default_log (stderr) and
// is overwritten by whisper_set_log_callback(); log() skips output when it is null.
static whisper_log_callback whisper_log = whisper_default_log;
|
|
732
|
+
|
|
733
|
+
// Formats a printf-style message and hands it to the installed log callback.
//
// fmt  - printf-style format string, followed by its arguments.
//
// The message is rendered into a fixed 1024-byte stack buffer; anything longer
// is truncated by vsnprintf (the buffer is always NUL-terminated).
// Does nothing when no callback is installed (whisper_log is null).
static void log(const char * fmt, ...) {
    if (!whisper_log) {
        return;
    }

    char buf[1024];

    va_list args;
    va_start(args, fmt);
    vsnprintf(buf, sizeof(buf), fmt, args);
    va_end(args); // every va_start must be matched by va_end before returning

    whisper_log(buf);
}
|
|
741
|
+
|
|
726
742
|
template<typename T>
|
|
727
743
|
static void read_safe(whisper_model_loader * loader, T & dest) {
|
|
728
744
|
loader->read(loader->context, &dest, sizeof(T));
|
|
@@ -746,7 +762,7 @@ static bool kv_cache_init(
|
|
|
746
762
|
cache.ctx = ggml_init(params);
|
|
747
763
|
|
|
748
764
|
if (!cache.ctx) {
|
|
749
|
-
|
|
765
|
+
log("%s: failed to allocate memory for kv cache\n", __func__);
|
|
750
766
|
return false;
|
|
751
767
|
}
|
|
752
768
|
|
|
@@ -782,7 +798,7 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
|
|
|
782
798
|
cache.ctx = ggml_init(params);
|
|
783
799
|
|
|
784
800
|
if (!cache.ctx) {
|
|
785
|
-
|
|
801
|
+
log("%s: failed to allocate memory for kv cache\n", __func__);
|
|
786
802
|
return false;
|
|
787
803
|
}
|
|
788
804
|
|
|
@@ -811,7 +827,7 @@ static void kv_cache_free(struct whisper_kv_cache & cache) {
|
|
|
811
827
|
// see the convert-pt-to-ggml.py script for details
|
|
812
828
|
//
|
|
813
829
|
static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
|
|
814
|
-
|
|
830
|
+
log("%s: loading model\n", __func__);
|
|
815
831
|
|
|
816
832
|
const int64_t t_start_us = ggml_time_us();
|
|
817
833
|
|
|
@@ -825,7 +841,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
825
841
|
uint32_t magic;
|
|
826
842
|
read_safe(loader, magic);
|
|
827
843
|
if (magic != GGML_FILE_MAGIC) {
|
|
828
|
-
|
|
844
|
+
log("%s: invalid model data (bad magic)\n", __func__);
|
|
829
845
|
return false;
|
|
830
846
|
}
|
|
831
847
|
}
|
|
@@ -876,25 +892,25 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
876
892
|
// in order to save memory and also to speed up the computation
|
|
877
893
|
wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
|
878
894
|
if (wctx.wtype == GGML_TYPE_COUNT) {
|
|
879
|
-
|
|
895
|
+
log("%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
|
|
880
896
|
return false;
|
|
881
897
|
}
|
|
882
898
|
|
|
883
899
|
const size_t scale = model.hparams.ftype ? 1 : 2;
|
|
884
900
|
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
901
|
+
log("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
|
902
|
+
log("%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
|
|
903
|
+
log("%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
|
|
904
|
+
log("%s: n_audio_head = %d\n", __func__, hparams.n_audio_head);
|
|
905
|
+
log("%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
|
|
906
|
+
log("%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx);
|
|
907
|
+
log("%s: n_text_state = %d\n", __func__, hparams.n_text_state);
|
|
908
|
+
log("%s: n_text_head = %d\n", __func__, hparams.n_text_head);
|
|
909
|
+
log("%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
|
|
910
|
+
log("%s: n_mels = %d\n", __func__, hparams.n_mels);
|
|
911
|
+
log("%s: ftype = %d\n", __func__, model.hparams.ftype);
|
|
912
|
+
log("%s: qntvr = %d\n", __func__, qntvr);
|
|
913
|
+
log("%s: type = %d\n", __func__, model.type);
|
|
898
914
|
|
|
899
915
|
// print memory requirements
|
|
900
916
|
{
|
|
@@ -912,7 +928,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
912
928
|
const size_t mem_required_decoder =
|
|
913
929
|
scale*MEM_REQ_KV_SELF.at(model.type);
|
|
914
930
|
|
|
915
|
-
|
|
931
|
+
log("%s: mem required = %7.2f MB (+ %7.2f MB per decoder)\n", __func__,
|
|
916
932
|
mem_required / 1024.0 / 1024.0, mem_required_decoder / 1024.0 / 1024.0);
|
|
917
933
|
}
|
|
918
934
|
|
|
@@ -944,7 +960,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
944
960
|
read_safe(loader, n_vocab);
|
|
945
961
|
|
|
946
962
|
//if (n_vocab != model.hparams.n_vocab) {
|
|
947
|
-
//
|
|
963
|
+
// log("%s: invalid model file '%s' (bad vocab size %d != %d)\n",
|
|
948
964
|
// __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
|
|
949
965
|
// return false;
|
|
950
966
|
//}
|
|
@@ -964,7 +980,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
964
980
|
word.assign(&tmp[0], tmp.size());
|
|
965
981
|
} else {
|
|
966
982
|
// seems like we have an empty-string token in multi-language models (i = 50256)
|
|
967
|
-
//
|
|
983
|
+
//log("%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
|
|
968
984
|
word = "";
|
|
969
985
|
}
|
|
970
986
|
|
|
@@ -988,7 +1004,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
988
1004
|
}
|
|
989
1005
|
|
|
990
1006
|
if (n_vocab < model.hparams.n_vocab) {
|
|
991
|
-
|
|
1007
|
+
log("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
|
|
992
1008
|
for (int i = n_vocab; i < model.hparams.n_vocab; i++) {
|
|
993
1009
|
if (i > vocab.token_beg) {
|
|
994
1010
|
word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
|
|
@@ -1127,7 +1143,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
1127
1143
|
|
|
1128
1144
|
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead
|
|
1129
1145
|
|
|
1130
|
-
|
|
1146
|
+
log("%s: model ctx = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
|
1131
1147
|
}
|
|
1132
1148
|
|
|
1133
1149
|
// create the ggml context
|
|
@@ -1140,7 +1156,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
1140
1156
|
|
|
1141
1157
|
model.ctx = ggml_init(params);
|
|
1142
1158
|
if (!model.ctx) {
|
|
1143
|
-
|
|
1159
|
+
log("%s: ggml_init() failed\n", __func__);
|
|
1144
1160
|
return false;
|
|
1145
1161
|
}
|
|
1146
1162
|
}
|
|
@@ -1373,20 +1389,20 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
1373
1389
|
name.assign(&tmp[0], tmp.size());
|
|
1374
1390
|
|
|
1375
1391
|
if (model.tensors.find(name) == model.tensors.end()) {
|
|
1376
|
-
|
|
1392
|
+
log("%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
|
1377
1393
|
return false;
|
|
1378
1394
|
}
|
|
1379
1395
|
|
|
1380
1396
|
auto tensor = model.tensors[name.data()];
|
|
1381
1397
|
if (ggml_nelements(tensor) != nelements) {
|
|
1382
|
-
|
|
1383
|
-
|
|
1398
|
+
log("%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
|
1399
|
+
log("%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n",
|
|
1384
1400
|
__func__, ne[0], ne[1], ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]);
|
|
1385
1401
|
return false;
|
|
1386
1402
|
}
|
|
1387
1403
|
|
|
1388
1404
|
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) {
|
|
1389
|
-
|
|
1405
|
+
log("%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
|
|
1390
1406
|
__func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], ne[0], ne[1], ne[2]);
|
|
1391
1407
|
return false;
|
|
1392
1408
|
}
|
|
@@ -1394,7 +1410,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
1394
1410
|
const size_t bpe = ggml_type_size(ggml_type(ttype));
|
|
1395
1411
|
|
|
1396
1412
|
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
|
1397
|
-
|
|
1413
|
+
log("%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
|
1398
1414
|
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
|
1399
1415
|
return false;
|
|
1400
1416
|
}
|
|
@@ -1407,12 +1423,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
1407
1423
|
model.n_loaded++;
|
|
1408
1424
|
}
|
|
1409
1425
|
|
|
1410
|
-
|
|
1426
|
+
log("%s: model size = %7.2f MB\n", __func__, total_size/1024.0/1024.0);
|
|
1411
1427
|
|
|
1412
1428
|
if (model.n_loaded == 0) {
|
|
1413
|
-
|
|
1429
|
+
log("%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
|
|
1414
1430
|
} else if (model.n_loaded != (int) model.tensors.size()) {
|
|
1415
|
-
|
|
1431
|
+
log("%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
|
|
1416
1432
|
return false;
|
|
1417
1433
|
}
|
|
1418
1434
|
}
|
|
@@ -2616,7 +2632,7 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
|
|
|
2616
2632
|
--j;
|
|
2617
2633
|
}
|
|
2618
2634
|
if (!found) {
|
|
2619
|
-
|
|
2635
|
+
log("unknown token\n");
|
|
2620
2636
|
++i;
|
|
2621
2637
|
}
|
|
2622
2638
|
}
|
|
@@ -2683,41 +2699,41 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
|
|
|
2683
2699
|
const size_t scale = ctx->model.hparams.ftype ? 1 : 2;
|
|
2684
2700
|
|
|
2685
2701
|
if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) {
|
|
2686
|
-
|
|
2702
|
+
log("%s: kv_cache_init() failed for self-attention cache\n", __func__);
|
|
2687
2703
|
delete state;
|
|
2688
2704
|
return nullptr;
|
|
2689
2705
|
}
|
|
2690
2706
|
|
|
2691
2707
|
{
|
|
2692
2708
|
const size_t memory_size = ggml_nbytes(state->decoders[0].kv_self.k) + ggml_nbytes(state->decoders[0].kv_self.v);
|
|
2693
|
-
|
|
2709
|
+
log("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
|
|
2694
2710
|
}
|
|
2695
2711
|
|
|
2696
2712
|
if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
|
|
2697
|
-
|
|
2713
|
+
log("%s: kv_cache_init() failed for cross-attention cache\n", __func__);
|
|
2698
2714
|
delete state;
|
|
2699
2715
|
return nullptr;
|
|
2700
2716
|
}
|
|
2701
2717
|
|
|
2702
2718
|
{
|
|
2703
2719
|
const size_t memory_size = ggml_nbytes(state->kv_cross.k) + ggml_nbytes(state->kv_cross.v);
|
|
2704
|
-
|
|
2720
|
+
log("%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
|
|
2705
2721
|
}
|
|
2706
2722
|
|
|
2707
2723
|
#ifdef WHISPER_USE_COREML
|
|
2708
2724
|
const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model);
|
|
2709
2725
|
|
|
2710
|
-
|
|
2711
|
-
|
|
2726
|
+
log("%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
|
|
2727
|
+
log("%s: first run on a device may take a while ...\n", __func__);
|
|
2712
2728
|
|
|
2713
2729
|
state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
|
|
2714
2730
|
if (!state->ctx_coreml) {
|
|
2715
|
-
|
|
2731
|
+
log("%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
|
|
2716
2732
|
#ifndef WHISPER_COREML_ALLOW_FALLBACK
|
|
2717
2733
|
return nullptr;
|
|
2718
2734
|
#endif
|
|
2719
2735
|
} else {
|
|
2720
|
-
|
|
2736
|
+
log("%s: Core ML model loaded\n", __func__);
|
|
2721
2737
|
}
|
|
2722
2738
|
#endif
|
|
2723
2739
|
|
|
@@ -2757,7 +2773,7 @@ int whisper_ctx_init_openvino_encoder(
|
|
|
2757
2773
|
return 1;
|
|
2758
2774
|
#else
|
|
2759
2775
|
if (!model_path && ctx->path_model.empty()) {
|
|
2760
|
-
|
|
2776
|
+
log("%s: model_path is nullptr, and ctx has no model_path set.\n", __func__);
|
|
2761
2777
|
return 1;
|
|
2762
2778
|
}
|
|
2763
2779
|
|
|
@@ -2777,15 +2793,15 @@ int whisper_ctx_init_openvino_encoder(
|
|
|
2777
2793
|
path_cache = cache_dir;
|
|
2778
2794
|
}
|
|
2779
2795
|
|
|
2780
|
-
|
|
2781
|
-
|
|
2796
|
+
log("%s: loading OpenVINO model from '%s'\n", __func__, path_encoder.c_str());
|
|
2797
|
+
log("%s: first run on a device may take a while ...\n", __func__);
|
|
2782
2798
|
|
|
2783
2799
|
ctx->state->ctx_openvino = whisper_openvino_init(path_encoder.c_str(), device, path_cache.c_str());
|
|
2784
2800
|
if (!ctx->state->ctx_openvino) {
|
|
2785
|
-
|
|
2801
|
+
log("%s: failed to init OpenVINO encoder from '%s'\n", __func__, path_encoder.c_str());
|
|
2786
2802
|
return 1;
|
|
2787
2803
|
} else {
|
|
2788
|
-
|
|
2804
|
+
log("%s: OpenVINO model loaded\n", __func__);
|
|
2789
2805
|
}
|
|
2790
2806
|
|
|
2791
2807
|
return 0;
|
|
@@ -2794,11 +2810,11 @@ int whisper_ctx_init_openvino_encoder(
|
|
|
2794
2810
|
|
|
2795
2811
|
struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
|
|
2796
2812
|
|
|
2797
|
-
|
|
2813
|
+
log("%s: loading model from '%s'\n", __func__, path_model);
|
|
2798
2814
|
|
|
2799
2815
|
auto fin = std::ifstream(path_model, std::ios::binary);
|
|
2800
2816
|
if (!fin) {
|
|
2801
|
-
|
|
2817
|
+
log("%s: failed to open '%s'\n", __func__, path_model);
|
|
2802
2818
|
return nullptr;
|
|
2803
2819
|
}
|
|
2804
2820
|
|
|
@@ -2840,7 +2856,7 @@ struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t
|
|
|
2840
2856
|
|
|
2841
2857
|
buf_context ctx = { reinterpret_cast<uint8_t*>(buffer), buffer_size, 0 };
|
|
2842
2858
|
|
|
2843
|
-
|
|
2859
|
+
log("%s: loading model from buffer\n", __func__);
|
|
2844
2860
|
|
|
2845
2861
|
whisper_model_loader loader = {};
|
|
2846
2862
|
|
|
@@ -2875,7 +2891,7 @@ struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loa
|
|
|
2875
2891
|
|
|
2876
2892
|
if (!whisper_model_load(loader, *ctx)) {
|
|
2877
2893
|
loader->close(loader->context);
|
|
2878
|
-
|
|
2894
|
+
log("%s: failed to load model\n", __func__);
|
|
2879
2895
|
delete ctx;
|
|
2880
2896
|
return nullptr;
|
|
2881
2897
|
}
|
|
@@ -2980,7 +2996,7 @@ void whisper_free_params(struct whisper_full_params * params) {
|
|
|
2980
2996
|
|
|
2981
2997
|
int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
|
|
2982
2998
|
if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
|
|
2983
|
-
|
|
2999
|
+
log("%s: failed to compute mel spectrogram\n", __func__);
|
|
2984
3000
|
return -1;
|
|
2985
3001
|
}
|
|
2986
3002
|
|
|
@@ -2994,7 +3010,7 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int
|
|
|
2994
3010
|
// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2
|
|
2995
3011
|
int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
|
|
2996
3012
|
if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, true, state->mel)) {
|
|
2997
|
-
|
|
3013
|
+
log("%s: failed to compute mel spectrogram\n", __func__);
|
|
2998
3014
|
return -1;
|
|
2999
3015
|
}
|
|
3000
3016
|
|
|
@@ -3013,7 +3029,7 @@ int whisper_set_mel_with_state(
|
|
|
3013
3029
|
int n_len,
|
|
3014
3030
|
int n_mel) {
|
|
3015
3031
|
if (n_mel != WHISPER_N_MEL) {
|
|
3016
|
-
|
|
3032
|
+
log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
|
|
3017
3033
|
return -1;
|
|
3018
3034
|
}
|
|
3019
3035
|
|
|
@@ -3037,7 +3053,7 @@ int whisper_set_mel(
|
|
|
3037
3053
|
|
|
3038
3054
|
int whisper_encode_with_state(struct whisper_context * ctx, struct whisper_state * state, int offset, int n_threads) {
|
|
3039
3055
|
if (!whisper_encode_internal(*ctx, *state, offset, n_threads)) {
|
|
3040
|
-
|
|
3056
|
+
log("%s: failed to eval\n", __func__);
|
|
3041
3057
|
return -1;
|
|
3042
3058
|
}
|
|
3043
3059
|
|
|
@@ -3046,7 +3062,7 @@ int whisper_encode_with_state(struct whisper_context * ctx, struct whisper_state
|
|
|
3046
3062
|
|
|
3047
3063
|
int whisper_encode(struct whisper_context * ctx, int offset, int n_threads) {
|
|
3048
3064
|
if (!whisper_encode_internal(*ctx, *ctx->state, offset, n_threads)) {
|
|
3049
|
-
|
|
3065
|
+
log("%s: failed to eval\n", __func__);
|
|
3050
3066
|
return -1;
|
|
3051
3067
|
}
|
|
3052
3068
|
|
|
@@ -3057,7 +3073,7 @@ int whisper_decode_with_state(struct whisper_context * ctx, struct whisper_state
|
|
|
3057
3073
|
const int selected_decoder_id = 0;
|
|
3058
3074
|
|
|
3059
3075
|
if (!whisper_decode_internal(*ctx, *state, state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
|
|
3060
|
-
|
|
3076
|
+
log("%s: failed to eval\n", __func__);
|
|
3061
3077
|
return 1;
|
|
3062
3078
|
}
|
|
3063
3079
|
|
|
@@ -3069,13 +3085,13 @@ int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, i
|
|
|
3069
3085
|
const int selected_decoder_id = 0;
|
|
3070
3086
|
|
|
3071
3087
|
if (ctx->state == nullptr) {
|
|
3072
|
-
|
|
3088
|
+
log("%s: ERROR state was not loaded.\n", __func__);
|
|
3073
3089
|
return false;
|
|
3074
3090
|
}
|
|
3075
3091
|
|
|
3076
3092
|
|
|
3077
3093
|
if (!whisper_decode_internal(*ctx, *ctx->state, ctx->state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
|
|
3078
|
-
|
|
3094
|
+
log("%s: failed to eval\n", __func__);
|
|
3079
3095
|
return 1;
|
|
3080
3096
|
}
|
|
3081
3097
|
|
|
@@ -3086,7 +3102,7 @@ int whisper_tokenize(struct whisper_context * ctx, const char * text, whisper_to
|
|
|
3086
3102
|
const auto res = tokenize(ctx->vocab, text);
|
|
3087
3103
|
|
|
3088
3104
|
if (n_max_tokens < (int) res.size()) {
|
|
3089
|
-
|
|
3105
|
+
log("%s: too many resulting tokens: %d (max %d)\n", __func__, (int) res.size(), n_max_tokens);
|
|
3090
3106
|
return -1;
|
|
3091
3107
|
}
|
|
3092
3108
|
|
|
@@ -3114,7 +3130,7 @@ int whisper_lang_id(const char * lang) {
|
|
|
3114
3130
|
}
|
|
3115
3131
|
}
|
|
3116
3132
|
|
|
3117
|
-
|
|
3133
|
+
log("%s: unknown language '%s'\n", __func__, lang);
|
|
3118
3134
|
return -1;
|
|
3119
3135
|
}
|
|
3120
3136
|
return g_lang.at(lang).first;
|
|
@@ -3127,7 +3143,7 @@ const char * whisper_lang_str(int id) {
|
|
|
3127
3143
|
}
|
|
3128
3144
|
}
|
|
3129
3145
|
|
|
3130
|
-
|
|
3146
|
+
log("%s: unknown language id %d\n", __func__, id);
|
|
3131
3147
|
return nullptr;
|
|
3132
3148
|
}
|
|
3133
3149
|
|
|
@@ -3140,25 +3156,25 @@ int whisper_lang_auto_detect_with_state(
|
|
|
3140
3156
|
const int seek = offset_ms/10;
|
|
3141
3157
|
|
|
3142
3158
|
if (seek < 0) {
|
|
3143
|
-
|
|
3159
|
+
log("%s: offset %dms is before the start of the audio\n", __func__, offset_ms);
|
|
3144
3160
|
return -1;
|
|
3145
3161
|
}
|
|
3146
3162
|
|
|
3147
3163
|
if (seek >= state->mel.n_len_org) {
|
|
3148
|
-
|
|
3164
|
+
log("%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len_org*10);
|
|
3149
3165
|
return -2;
|
|
3150
3166
|
}
|
|
3151
3167
|
|
|
3152
3168
|
// run the encoder
|
|
3153
3169
|
if (whisper_encode_with_state(ctx, state, seek, n_threads) != 0) {
|
|
3154
|
-
|
|
3170
|
+
log("%s: failed to encode\n", __func__);
|
|
3155
3171
|
return -6;
|
|
3156
3172
|
}
|
|
3157
3173
|
|
|
3158
3174
|
const std::vector<whisper_token> prompt = { whisper_token_sot(ctx) };
|
|
3159
3175
|
|
|
3160
3176
|
if (whisper_decode_with_state(ctx, state, prompt.data(), prompt.size(), 0, n_threads) != 0) {
|
|
3161
|
-
|
|
3177
|
+
log("%s: failed to decode\n", __func__);
|
|
3162
3178
|
return -7;
|
|
3163
3179
|
}
|
|
3164
3180
|
|
|
@@ -3359,21 +3375,21 @@ whisper_token whisper_token_transcribe(struct whisper_context * ctx) {
|
|
|
3359
3375
|
void whisper_print_timings(struct whisper_context * ctx) {
|
|
3360
3376
|
const int64_t t_end_us = ggml_time_us();
|
|
3361
3377
|
|
|
3362
|
-
|
|
3363
|
-
|
|
3378
|
+
log("\n");
|
|
3379
|
+
log("%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
|
|
3364
3380
|
if (ctx->state != nullptr) {
|
|
3365
3381
|
|
|
3366
3382
|
const int32_t n_sample = std::max(1, ctx->state->n_sample);
|
|
3367
3383
|
const int32_t n_encode = std::max(1, ctx->state->n_encode);
|
|
3368
3384
|
const int32_t n_decode = std::max(1, ctx->state->n_decode);
|
|
3369
3385
|
|
|
3370
|
-
|
|
3371
|
-
|
|
3372
|
-
|
|
3373
|
-
|
|
3374
|
-
|
|
3386
|
+
log("%s: fallbacks = %3d p / %3d h\n", __func__, ctx->state->n_fail_p, ctx->state->n_fail_h);
|
|
3387
|
+
log("%s: mel time = %8.2f ms\n", __func__, ctx->state->t_mel_us / 1000.0f);
|
|
3388
|
+
log("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_sample_us, n_sample, 1e-3f * ctx->state->t_sample_us / n_sample);
|
|
3389
|
+
log("%s: encode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_encode_us, n_encode, 1e-3f * ctx->state->t_encode_us / n_encode);
|
|
3390
|
+
log("%s: decode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_decode_us, n_decode, 1e-3f * ctx->state->t_decode_us / n_decode);
|
|
3375
3391
|
}
|
|
3376
|
-
|
|
3392
|
+
log("%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
|
|
3377
3393
|
}
|
|
3378
3394
|
|
|
3379
3395
|
void whisper_reset_timings(struct whisper_context * ctx) {
|
|
@@ -3697,7 +3713,7 @@ static void whisper_process_logits(
|
|
|
3697
3713
|
const bool last_was_timestamp = tokens_cur.size() > 0 && tokens_cur.back().id >= vocab.token_beg;
|
|
3698
3714
|
const bool penultimate_was_timestamp = tokens_cur.size() < 2 || tokens_cur[tokens_cur.size() - 2].id >= vocab.token_beg;
|
|
3699
3715
|
|
|
3700
|
-
//
|
|
3716
|
+
//log("last_was_timestamp=%d penultimate_was_timestamp=%d\n", last_was_timestamp, penultimate_was_timestamp);
|
|
3701
3717
|
|
|
3702
3718
|
if (last_was_timestamp) {
|
|
3703
3719
|
if (penultimate_was_timestamp) {
|
|
@@ -3773,7 +3789,7 @@ static void whisper_process_logits(
|
|
|
3773
3789
|
|
|
3774
3790
|
const float max_text_token_logprob = *std::max_element(logprobs.begin(), logprobs.begin() + vocab.token_beg);
|
|
3775
3791
|
|
|
3776
|
-
//
|
|
3792
|
+
//log("timestamp_logprob=%f max_text_token_logprob=%f\n", timestamp_logprob, max_text_token_logprob);
|
|
3777
3793
|
|
|
3778
3794
|
if (timestamp_logprob > max_text_token_logprob) {
|
|
3779
3795
|
for (int i = 0; i < vocab.token_beg; ++i) {
|
|
@@ -4022,12 +4038,12 @@ int whisper_full_with_state(
|
|
|
4022
4038
|
// compute log mel spectrogram
|
|
4023
4039
|
if (params.speed_up) {
|
|
4024
4040
|
if (whisper_pcm_to_mel_phase_vocoder_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
|
|
4025
|
-
|
|
4041
|
+
log("%s: failed to compute log mel spectrogram\n", __func__);
|
|
4026
4042
|
return -1;
|
|
4027
4043
|
}
|
|
4028
4044
|
} else {
|
|
4029
4045
|
if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
|
|
4030
|
-
|
|
4046
|
+
log("%s: failed to compute log mel spectrogram\n", __func__);
|
|
4031
4047
|
return -2;
|
|
4032
4048
|
}
|
|
4033
4049
|
}
|
|
@@ -4038,13 +4054,13 @@ int whisper_full_with_state(
|
|
|
4038
4054
|
|
|
4039
4055
|
const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data());
|
|
4040
4056
|
if (lang_id < 0) {
|
|
4041
|
-
|
|
4057
|
+
log("%s: failed to auto-detect language\n", __func__);
|
|
4042
4058
|
return -3;
|
|
4043
4059
|
}
|
|
4044
4060
|
state->lang_id = lang_id;
|
|
4045
4061
|
params.language = whisper_lang_str(lang_id);
|
|
4046
4062
|
|
|
4047
|
-
|
|
4063
|
+
log("%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
|
|
4048
4064
|
if (params.detect_language) {
|
|
4049
4065
|
return 0;
|
|
4050
4066
|
}
|
|
@@ -4101,7 +4117,7 @@ int whisper_full_with_state(
|
|
|
4101
4117
|
if (decoder.kv_self.ctx == nullptr) {
|
|
4102
4118
|
decoder.kv_self = state->decoders[0].kv_self;
|
|
4103
4119
|
if (!kv_cache_reinit(decoder.kv_self)) {
|
|
4104
|
-
|
|
4120
|
+
log("%s: kv_cache_reinit() failed for self-attention, decoder %d\n", __func__, j);
|
|
4105
4121
|
return -4;
|
|
4106
4122
|
}
|
|
4107
4123
|
|
|
@@ -4145,7 +4161,7 @@ int whisper_full_with_state(
|
|
|
4145
4161
|
|
|
4146
4162
|
// overwrite audio_ctx, max allowed is hparams.n_audio_ctx
|
|
4147
4163
|
if (params.audio_ctx > whisper_n_audio_ctx(ctx)) {
|
|
4148
|
-
|
|
4164
|
+
log("%s: audio_ctx is larger than the maximum allowed (%d > %d)\n", __func__, params.audio_ctx, whisper_n_audio_ctx(ctx));
|
|
4149
4165
|
return -5;
|
|
4150
4166
|
}
|
|
4151
4167
|
state->exp_n_audio_ctx = params.audio_ctx;
|
|
@@ -4163,9 +4179,6 @@ int whisper_full_with_state(
|
|
|
4163
4179
|
}
|
|
4164
4180
|
}
|
|
4165
4181
|
|
|
4166
|
-
int progress_prev = 0;
|
|
4167
|
-
int progress_step = 5;
|
|
4168
|
-
|
|
4169
4182
|
int seek = seek_start;
|
|
4170
4183
|
|
|
4171
4184
|
std::vector<whisper_token> prompt;
|
|
@@ -4192,16 +4205,11 @@ int whisper_full_with_state(
|
|
|
4192
4205
|
|
|
4193
4206
|
// main loop
|
|
4194
4207
|
while (true) {
|
|
4195
|
-
const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
|
|
4196
|
-
while (progress_cur >= progress_prev + progress_step) {
|
|
4197
|
-
progress_prev += progress_step;
|
|
4198
|
-
if (params.print_progress) {
|
|
4199
|
-
fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress_prev);
|
|
4200
|
-
}
|
|
4201
|
-
}
|
|
4202
4208
|
if (params.progress_callback) {
|
|
4209
|
+
const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
|
|
4210
|
+
|
|
4203
4211
|
params.progress_callback(
|
|
4204
|
-
ctx, ctx->state,
|
|
4212
|
+
ctx, ctx->state, progress_cur, params.progress_callback_user_data);
|
|
4205
4213
|
}
|
|
4206
4214
|
|
|
4207
4215
|
// of only 1 second left, then stop
|
|
@@ -4211,14 +4219,14 @@ int whisper_full_with_state(
|
|
|
4211
4219
|
|
|
4212
4220
|
if (params.encoder_begin_callback) {
|
|
4213
4221
|
if (params.encoder_begin_callback(ctx, state, params.encoder_begin_callback_user_data) == false) {
|
|
4214
|
-
|
|
4222
|
+
log("%s: encoder_begin_callback returned false - aborting\n", __func__);
|
|
4215
4223
|
break;
|
|
4216
4224
|
}
|
|
4217
4225
|
}
|
|
4218
4226
|
|
|
4219
4227
|
// encode audio features starting at offset seek
|
|
4220
4228
|
if (!whisper_encode_internal(*ctx, *state, seek, params.n_threads)) {
|
|
4221
|
-
|
|
4229
|
+
log("%s: failed to encode\n", __func__);
|
|
4222
4230
|
return -6;
|
|
4223
4231
|
}
|
|
4224
4232
|
|
|
@@ -4301,7 +4309,7 @@ int whisper_full_with_state(
|
|
|
4301
4309
|
WHISPER_PRINT_DEBUG("\n\n");
|
|
4302
4310
|
|
|
4303
4311
|
if (!whisper_decode_internal(*ctx, *state, state->decoders[0], prompt.data(), prompt.size(), 0, params.n_threads)) {
|
|
4304
|
-
|
|
4312
|
+
log("%s: failed to decode\n", __func__);
|
|
4305
4313
|
return -7;
|
|
4306
4314
|
}
|
|
4307
4315
|
|
|
@@ -4539,7 +4547,7 @@ int whisper_full_with_state(
|
|
|
4539
4547
|
//WHISPER_PRINT_DEBUG("%s: decoder %d: token %d, kv_self.n %d, seek_delta %d\n", __func__, j, decoder.tokens_tmp[0], decoder.kv_self.n, decoder.seek_delta);
|
|
4540
4548
|
|
|
4541
4549
|
if (!whisper_decode_internal(*ctx, *state, decoder, decoder.tokens_tmp.data(), decoder.tokens_tmp.size(), decoder.kv_self.n, params.n_threads)) {
|
|
4542
|
-
|
|
4550
|
+
log("%s: failed to decode\n", __func__);
|
|
4543
4551
|
return -8;
|
|
4544
4552
|
}
|
|
4545
4553
|
|
|
@@ -4861,12 +4869,12 @@ int whisper_full_parallel(
|
|
|
4861
4869
|
ctx->state->t_decode_us /= n_processors;
|
|
4862
4870
|
|
|
4863
4871
|
// print information about the audio boundaries
|
|
4864
|
-
|
|
4865
|
-
|
|
4872
|
+
log("\n");
|
|
4873
|
+
log("%s: the audio has been split into %d chunks at the following times:\n", __func__, n_processors);
|
|
4866
4874
|
for (int i = 0; i < n_processors - 1; ++i) {
|
|
4867
|
-
|
|
4875
|
+
log("%s: split %d - %s\n", __func__, (i + 1), to_timestamp(100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t).c_str());
|
|
4868
4876
|
}
|
|
4869
|
-
|
|
4877
|
+
log("%s: the transcription quality may be degraded near these boundaries\n", __func__);
|
|
4870
4878
|
|
|
4871
4879
|
return ret;
|
|
4872
4880
|
}
|
|
@@ -5226,7 +5234,7 @@ static void whisper_exp_compute_token_level_timestamps(
|
|
|
5226
5234
|
const int n_samples = state.energy.size();
|
|
5227
5235
|
|
|
5228
5236
|
if (n_samples == 0) {
|
|
5229
|
-
|
|
5237
|
+
log("%s: no signal data available\n", __func__);
|
|
5230
5238
|
return;
|
|
5231
5239
|
}
|
|
5232
5240
|
|
|
@@ -5446,3 +5454,7 @@ static void whisper_exp_compute_token_level_timestamps(
|
|
|
5446
5454
|
// }
|
|
5447
5455
|
//}
|
|
5448
5456
|
}
|
|
5457
|
+
|
|
5458
|
+
// Replaces the log sink used by log() with `callback`.
// Passing a null callback makes log() return without emitting anything.
void whisper_set_log_callback(whisper_log_callback callback) {
|
|
5459
|
+
whisper_log = callback;
|
|
5460
|
+
}
|