whisper.rn 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/whisper.cpp CHANGED
@@ -14,6 +14,7 @@
14
14
  #define _USE_MATH_DEFINES
15
15
  #include <cmath>
16
16
  #include <cstdio>
17
+ #include <cstdarg>
17
18
  #include <cstring>
18
19
  #include <fstream>
19
20
  #include <map>
@@ -92,7 +93,7 @@ static void byteswap_tensor(ggml_tensor * tensor) {
92
93
  #define WHISPER_ASSERT(x) \
93
94
  do { \
94
95
  if (!(x)) { \
95
- fprintf(stderr, "WHISPER_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
96
+ log("WHISPER_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
96
97
  abort(); \
97
98
  } \
98
99
  } while (0)
@@ -723,6 +724,21 @@ struct whisper_context {
723
724
  std::string path_model; // populated by whisper_init_from_file()
724
725
  };
725
726
 
727
// Default log sink: writes the already-formatted message to stderr exactly
// as received (no prefix or newline is added).
static void whisper_default_log(const char * text) {
    fputs(text, stderr);
}
730
+
731
+ static whisper_log_callback whisper_log = whisper_default_log;
732
+
733
+ static void log(const char * fmt, ...) {
734
+ if (!whisper_log) return;
735
+ char buf[1024];
736
+ va_list args;
737
+ va_start(args, fmt);
738
+ vsnprintf(buf, sizeof(buf), fmt, args);
739
+ whisper_log(buf);
740
+ }
741
+
726
742
  template<typename T>
727
743
  static void read_safe(whisper_model_loader * loader, T & dest) {
728
744
  loader->read(loader->context, &dest, sizeof(T));
@@ -746,7 +762,7 @@ static bool kv_cache_init(
746
762
  cache.ctx = ggml_init(params);
747
763
 
748
764
  if (!cache.ctx) {
749
- fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
765
+ log("%s: failed to allocate memory for kv cache\n", __func__);
750
766
  return false;
751
767
  }
752
768
 
@@ -782,7 +798,7 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
782
798
  cache.ctx = ggml_init(params);
783
799
 
784
800
  if (!cache.ctx) {
785
- fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
801
+ log("%s: failed to allocate memory for kv cache\n", __func__);
786
802
  return false;
787
803
  }
788
804
 
@@ -811,7 +827,7 @@ static void kv_cache_free(struct whisper_kv_cache & cache) {
811
827
  // see the convert-pt-to-ggml.py script for details
812
828
  //
813
829
  static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
814
- fprintf(stderr, "%s: loading model\n", __func__);
830
+ log("%s: loading model\n", __func__);
815
831
 
816
832
  const int64_t t_start_us = ggml_time_us();
817
833
 
@@ -825,7 +841,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
825
841
  uint32_t magic;
826
842
  read_safe(loader, magic);
827
843
  if (magic != GGML_FILE_MAGIC) {
828
- fprintf(stderr, "%s: invalid model data (bad magic)\n", __func__);
844
+ log("%s: invalid model data (bad magic)\n", __func__);
829
845
  return false;
830
846
  }
831
847
  }
@@ -876,25 +892,25 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
876
892
  // in order to save memory and also to speed up the computation
877
893
  wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
878
894
  if (wctx.wtype == GGML_TYPE_COUNT) {
879
- fprintf(stderr, "%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
895
+ log("%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
880
896
  return false;
881
897
  }
882
898
 
883
899
  const size_t scale = model.hparams.ftype ? 1 : 2;
884
900
 
885
- fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
886
- fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
887
- fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
888
- fprintf(stderr, "%s: n_audio_head = %d\n", __func__, hparams.n_audio_head);
889
- fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
890
- fprintf(stderr, "%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx);
891
- fprintf(stderr, "%s: n_text_state = %d\n", __func__, hparams.n_text_state);
892
- fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
893
- fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
894
- fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
895
- fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
896
- fprintf(stderr, "%s: qntvr = %d\n", __func__, qntvr);
897
- fprintf(stderr, "%s: type = %d\n", __func__, model.type);
901
+ log("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
902
+ log("%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
903
+ log("%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
904
+ log("%s: n_audio_head = %d\n", __func__, hparams.n_audio_head);
905
+ log("%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
906
+ log("%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx);
907
+ log("%s: n_text_state = %d\n", __func__, hparams.n_text_state);
908
+ log("%s: n_text_head = %d\n", __func__, hparams.n_text_head);
909
+ log("%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
910
+ log("%s: n_mels = %d\n", __func__, hparams.n_mels);
911
+ log("%s: ftype = %d\n", __func__, model.hparams.ftype);
912
+ log("%s: qntvr = %d\n", __func__, qntvr);
913
+ log("%s: type = %d\n", __func__, model.type);
898
914
 
899
915
  // print memory requirements
900
916
  {
@@ -912,7 +928,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
912
928
  const size_t mem_required_decoder =
913
929
  scale*MEM_REQ_KV_SELF.at(model.type);
914
930
 
915
- fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per decoder)\n", __func__,
931
+ log("%s: mem required = %7.2f MB (+ %7.2f MB per decoder)\n", __func__,
916
932
  mem_required / 1024.0 / 1024.0, mem_required_decoder / 1024.0 / 1024.0);
917
933
  }
918
934
 
@@ -944,7 +960,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
944
960
  read_safe(loader, n_vocab);
945
961
 
946
962
  //if (n_vocab != model.hparams.n_vocab) {
947
- // fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
963
+ // log("%s: invalid model file '%s' (bad vocab size %d != %d)\n",
948
964
  // __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
949
965
  // return false;
950
966
  //}
@@ -964,7 +980,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
964
980
  word.assign(&tmp[0], tmp.size());
965
981
  } else {
966
982
  // seems like we have an empty-string token in multi-language models (i = 50256)
967
- //fprintf(stderr, "%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
983
+ //log("%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
968
984
  word = "";
969
985
  }
970
986
 
@@ -988,7 +1004,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
988
1004
  }
989
1005
 
990
1006
  if (n_vocab < model.hparams.n_vocab) {
991
- fprintf(stderr, "%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
1007
+ log("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
992
1008
  for (int i = n_vocab; i < model.hparams.n_vocab; i++) {
993
1009
  if (i > vocab.token_beg) {
994
1010
  word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
@@ -1127,7 +1143,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1127
1143
 
1128
1144
  ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead
1129
1145
 
1130
- fprintf(stderr, "%s: model ctx = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
1146
+ log("%s: model ctx = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
1131
1147
  }
1132
1148
 
1133
1149
  // create the ggml context
@@ -1140,7 +1156,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1140
1156
 
1141
1157
  model.ctx = ggml_init(params);
1142
1158
  if (!model.ctx) {
1143
- fprintf(stderr, "%s: ggml_init() failed\n", __func__);
1159
+ log("%s: ggml_init() failed\n", __func__);
1144
1160
  return false;
1145
1161
  }
1146
1162
  }
@@ -1373,20 +1389,20 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1373
1389
  name.assign(&tmp[0], tmp.size());
1374
1390
 
1375
1391
  if (model.tensors.find(name) == model.tensors.end()) {
1376
- fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
1392
+ log("%s: unknown tensor '%s' in model file\n", __func__, name.data());
1377
1393
  return false;
1378
1394
  }
1379
1395
 
1380
1396
  auto tensor = model.tensors[name.data()];
1381
1397
  if (ggml_nelements(tensor) != nelements) {
1382
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
1383
- fprintf(stderr, "%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n",
1398
+ log("%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
1399
+ log("%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n",
1384
1400
  __func__, ne[0], ne[1], ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]);
1385
1401
  return false;
1386
1402
  }
1387
1403
 
1388
1404
  if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) {
1389
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
1405
+ log("%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
1390
1406
  __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], ne[0], ne[1], ne[2]);
1391
1407
  return false;
1392
1408
  }
@@ -1394,7 +1410,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1394
1410
  const size_t bpe = ggml_type_size(ggml_type(ttype));
1395
1411
 
1396
1412
  if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
1397
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
1413
+ log("%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
1398
1414
  __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
1399
1415
  return false;
1400
1416
  }
@@ -1407,12 +1423,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1407
1423
  model.n_loaded++;
1408
1424
  }
1409
1425
 
1410
- fprintf(stderr, "%s: model size = %7.2f MB\n", __func__, total_size/1024.0/1024.0);
1426
+ log("%s: model size = %7.2f MB\n", __func__, total_size/1024.0/1024.0);
1411
1427
 
1412
1428
  if (model.n_loaded == 0) {
1413
- fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
1429
+ log("%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
1414
1430
  } else if (model.n_loaded != (int) model.tensors.size()) {
1415
- fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
1431
+ log("%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
1416
1432
  return false;
1417
1433
  }
1418
1434
  }
@@ -2616,7 +2632,7 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
2616
2632
  --j;
2617
2633
  }
2618
2634
  if (!found) {
2619
- fprintf(stderr, "unknown token \n");
2635
+ log("unknown token\n");
2620
2636
  ++i;
2621
2637
  }
2622
2638
  }
@@ -2683,41 +2699,41 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
2683
2699
  const size_t scale = ctx->model.hparams.ftype ? 1 : 2;
2684
2700
 
2685
2701
  if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) {
2686
- fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
2702
+ log("%s: kv_cache_init() failed for self-attention cache\n", __func__);
2687
2703
  delete state;
2688
2704
  return nullptr;
2689
2705
  }
2690
2706
 
2691
2707
  {
2692
2708
  const size_t memory_size = ggml_nbytes(state->decoders[0].kv_self.k) + ggml_nbytes(state->decoders[0].kv_self.v);
2693
- fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
2709
+ log("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
2694
2710
  }
2695
2711
 
2696
2712
  if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
2697
- fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
2713
+ log("%s: kv_cache_init() failed for cross-attention cache\n", __func__);
2698
2714
  delete state;
2699
2715
  return nullptr;
2700
2716
  }
2701
2717
 
2702
2718
  {
2703
2719
  const size_t memory_size = ggml_nbytes(state->kv_cross.k) + ggml_nbytes(state->kv_cross.v);
2704
- fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
2720
+ log("%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
2705
2721
  }
2706
2722
 
2707
2723
  #ifdef WHISPER_USE_COREML
2708
2724
  const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model);
2709
2725
 
2710
- fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
2711
- fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
2726
+ log("%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
2727
+ log("%s: first run on a device may take a while ...\n", __func__);
2712
2728
 
2713
2729
  state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
2714
2730
  if (!state->ctx_coreml) {
2715
- fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
2731
+ log("%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
2716
2732
  #ifndef WHISPER_COREML_ALLOW_FALLBACK
2717
2733
  return nullptr;
2718
2734
  #endif
2719
2735
  } else {
2720
- fprintf(stderr, "%s: Core ML model loaded\n", __func__);
2736
+ log("%s: Core ML model loaded\n", __func__);
2721
2737
  }
2722
2738
  #endif
2723
2739
 
@@ -2757,7 +2773,7 @@ int whisper_ctx_init_openvino_encoder(
2757
2773
  return 1;
2758
2774
  #else
2759
2775
  if (!model_path && ctx->path_model.empty()) {
2760
- fprintf(stderr, "%s: model_path is nullptr, and ctx has no model_path set.\n", __func__);
2776
+ log("%s: model_path is nullptr, and ctx has no model_path set.\n", __func__);
2761
2777
  return 1;
2762
2778
  }
2763
2779
 
@@ -2777,15 +2793,15 @@ int whisper_ctx_init_openvino_encoder(
2777
2793
  path_cache = cache_dir;
2778
2794
  }
2779
2795
 
2780
- fprintf(stderr, "%s: loading OpenVINO model from '%s'\n", __func__, path_encoder.c_str());
2781
- fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
2796
+ log("%s: loading OpenVINO model from '%s'\n", __func__, path_encoder.c_str());
2797
+ log("%s: first run on a device may take a while ...\n", __func__);
2782
2798
 
2783
2799
  ctx->state->ctx_openvino = whisper_openvino_init(path_encoder.c_str(), device, path_cache.c_str());
2784
2800
  if (!ctx->state->ctx_openvino) {
2785
- fprintf(stderr, "%s: failed to init OpenVINO encoder from '%s'\n", __func__, path_encoder.c_str());
2801
+ log("%s: failed to init OpenVINO encoder from '%s'\n", __func__, path_encoder.c_str());
2786
2802
  return 1;
2787
2803
  } else {
2788
- fprintf(stderr, "%s: OpenVINO model loaded\n", __func__);
2804
+ log("%s: OpenVINO model loaded\n", __func__);
2789
2805
  }
2790
2806
 
2791
2807
  return 0;
@@ -2794,11 +2810,11 @@ int whisper_ctx_init_openvino_encoder(
2794
2810
 
2795
2811
  struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
2796
2812
 
2797
- fprintf(stderr, "%s: loading model from '%s'\n", __func__, path_model);
2813
+ log("%s: loading model from '%s'\n", __func__, path_model);
2798
2814
 
2799
2815
  auto fin = std::ifstream(path_model, std::ios::binary);
2800
2816
  if (!fin) {
2801
- fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_model);
2817
+ log("%s: failed to open '%s'\n", __func__, path_model);
2802
2818
  return nullptr;
2803
2819
  }
2804
2820
 
@@ -2840,7 +2856,7 @@ struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t
2840
2856
 
2841
2857
  buf_context ctx = { reinterpret_cast<uint8_t*>(buffer), buffer_size, 0 };
2842
2858
 
2843
- fprintf(stderr, "%s: loading model from buffer\n", __func__);
2859
+ log("%s: loading model from buffer\n", __func__);
2844
2860
 
2845
2861
  whisper_model_loader loader = {};
2846
2862
 
@@ -2875,7 +2891,7 @@ struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loa
2875
2891
 
2876
2892
  if (!whisper_model_load(loader, *ctx)) {
2877
2893
  loader->close(loader->context);
2878
- fprintf(stderr, "%s: failed to load model\n", __func__);
2894
+ log("%s: failed to load model\n", __func__);
2879
2895
  delete ctx;
2880
2896
  return nullptr;
2881
2897
  }
@@ -2980,7 +2996,7 @@ void whisper_free_params(struct whisper_full_params * params) {
2980
2996
 
2981
2997
  int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
2982
2998
  if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
2983
- fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
2999
+ log("%s: failed to compute mel spectrogram\n", __func__);
2984
3000
  return -1;
2985
3001
  }
2986
3002
 
@@ -2994,7 +3010,7 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int
2994
3010
  // same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2
2995
3011
  int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
2996
3012
  if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, true, state->mel)) {
2997
- fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
3013
+ log("%s: failed to compute mel spectrogram\n", __func__);
2998
3014
  return -1;
2999
3015
  }
3000
3016
 
@@ -3013,7 +3029,7 @@ int whisper_set_mel_with_state(
3013
3029
  int n_len,
3014
3030
  int n_mel) {
3015
3031
  if (n_mel != WHISPER_N_MEL) {
3016
- fprintf(stderr, "%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
3032
+ log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
3017
3033
  return -1;
3018
3034
  }
3019
3035
 
@@ -3037,7 +3053,7 @@ int whisper_set_mel(
3037
3053
 
3038
3054
  int whisper_encode_with_state(struct whisper_context * ctx, struct whisper_state * state, int offset, int n_threads) {
3039
3055
  if (!whisper_encode_internal(*ctx, *state, offset, n_threads)) {
3040
- fprintf(stderr, "%s: failed to eval\n", __func__);
3056
+ log("%s: failed to eval\n", __func__);
3041
3057
  return -1;
3042
3058
  }
3043
3059
 
@@ -3046,7 +3062,7 @@ int whisper_encode_with_state(struct whisper_context * ctx, struct whisper_state
3046
3062
 
3047
3063
  int whisper_encode(struct whisper_context * ctx, int offset, int n_threads) {
3048
3064
  if (!whisper_encode_internal(*ctx, *ctx->state, offset, n_threads)) {
3049
- fprintf(stderr, "%s: failed to eval\n", __func__);
3065
+ log("%s: failed to eval\n", __func__);
3050
3066
  return -1;
3051
3067
  }
3052
3068
 
@@ -3057,7 +3073,7 @@ int whisper_decode_with_state(struct whisper_context * ctx, struct whisper_state
3057
3073
  const int selected_decoder_id = 0;
3058
3074
 
3059
3075
  if (!whisper_decode_internal(*ctx, *state, state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
3060
- fprintf(stderr, "%s: failed to eval\n", __func__);
3076
+ log("%s: failed to eval\n", __func__);
3061
3077
  return 1;
3062
3078
  }
3063
3079
 
@@ -3069,13 +3085,13 @@ int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, i
3069
3085
  const int selected_decoder_id = 0;
3070
3086
 
3071
3087
  if (ctx->state == nullptr) {
3072
- fprintf(stderr, "%s: ERROR state was not loaded.\n", __func__);
3088
+ log("%s: ERROR state was not loaded.\n", __func__);
3073
3089
  return false;
3074
3090
  }
3075
3091
 
3076
3092
 
3077
3093
  if (!whisper_decode_internal(*ctx, *ctx->state, ctx->state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
3078
- fprintf(stderr, "%s: failed to eval\n", __func__);
3094
+ log("%s: failed to eval\n", __func__);
3079
3095
  return 1;
3080
3096
  }
3081
3097
 
@@ -3086,7 +3102,7 @@ int whisper_tokenize(struct whisper_context * ctx, const char * text, whisper_to
3086
3102
  const auto res = tokenize(ctx->vocab, text);
3087
3103
 
3088
3104
  if (n_max_tokens < (int) res.size()) {
3089
- fprintf(stderr, "%s: too many resulting tokens: %d (max %d)\n", __func__, (int) res.size(), n_max_tokens);
3105
+ log("%s: too many resulting tokens: %d (max %d)\n", __func__, (int) res.size(), n_max_tokens);
3090
3106
  return -1;
3091
3107
  }
3092
3108
 
@@ -3114,7 +3130,7 @@ int whisper_lang_id(const char * lang) {
3114
3130
  }
3115
3131
  }
3116
3132
 
3117
- fprintf(stderr, "%s: unknown language '%s'\n", __func__, lang);
3133
+ log("%s: unknown language '%s'\n", __func__, lang);
3118
3134
  return -1;
3119
3135
  }
3120
3136
  return g_lang.at(lang).first;
@@ -3127,7 +3143,7 @@ const char * whisper_lang_str(int id) {
3127
3143
  }
3128
3144
  }
3129
3145
 
3130
- fprintf(stderr, "%s: unknown language id %d\n", __func__, id);
3146
+ log("%s: unknown language id %d\n", __func__, id);
3131
3147
  return nullptr;
3132
3148
  }
3133
3149
 
@@ -3140,25 +3156,25 @@ int whisper_lang_auto_detect_with_state(
3140
3156
  const int seek = offset_ms/10;
3141
3157
 
3142
3158
  if (seek < 0) {
3143
- fprintf(stderr, "%s: offset %dms is before the start of the audio\n", __func__, offset_ms);
3159
+ log("%s: offset %dms is before the start of the audio\n", __func__, offset_ms);
3144
3160
  return -1;
3145
3161
  }
3146
3162
 
3147
3163
  if (seek >= state->mel.n_len_org) {
3148
- fprintf(stderr, "%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len_org*10);
3164
+ log("%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len_org*10);
3149
3165
  return -2;
3150
3166
  }
3151
3167
 
3152
3168
  // run the encoder
3153
3169
  if (whisper_encode_with_state(ctx, state, seek, n_threads) != 0) {
3154
- fprintf(stderr, "%s: failed to encode\n", __func__);
3170
+ log("%s: failed to encode\n", __func__);
3155
3171
  return -6;
3156
3172
  }
3157
3173
 
3158
3174
  const std::vector<whisper_token> prompt = { whisper_token_sot(ctx) };
3159
3175
 
3160
3176
  if (whisper_decode_with_state(ctx, state, prompt.data(), prompt.size(), 0, n_threads) != 0) {
3161
- fprintf(stderr, "%s: failed to decode\n", __func__);
3177
+ log("%s: failed to decode\n", __func__);
3162
3178
  return -7;
3163
3179
  }
3164
3180
 
@@ -3359,21 +3375,21 @@ whisper_token whisper_token_transcribe(struct whisper_context * ctx) {
3359
3375
  void whisper_print_timings(struct whisper_context * ctx) {
3360
3376
  const int64_t t_end_us = ggml_time_us();
3361
3377
 
3362
- fprintf(stderr, "\n");
3363
- fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
3378
+ log("\n");
3379
+ log("%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
3364
3380
  if (ctx->state != nullptr) {
3365
3381
 
3366
3382
  const int32_t n_sample = std::max(1, ctx->state->n_sample);
3367
3383
  const int32_t n_encode = std::max(1, ctx->state->n_encode);
3368
3384
  const int32_t n_decode = std::max(1, ctx->state->n_decode);
3369
3385
 
3370
- fprintf(stderr, "%s: fallbacks = %3d p / %3d h\n", __func__, ctx->state->n_fail_p, ctx->state->n_fail_h);
3371
- fprintf(stderr, "%s: mel time = %8.2f ms\n", __func__, ctx->state->t_mel_us / 1000.0f);
3372
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_sample_us, n_sample, 1e-3f * ctx->state->t_sample_us / n_sample);
3373
- fprintf(stderr, "%s: encode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_encode_us, n_encode, 1e-3f * ctx->state->t_encode_us / n_encode);
3374
- fprintf(stderr, "%s: decode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_decode_us, n_decode, 1e-3f * ctx->state->t_decode_us / n_decode);
3386
+ log("%s: fallbacks = %3d p / %3d h\n", __func__, ctx->state->n_fail_p, ctx->state->n_fail_h);
3387
+ log("%s: mel time = %8.2f ms\n", __func__, ctx->state->t_mel_us / 1000.0f);
3388
+ log("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_sample_us, n_sample, 1e-3f * ctx->state->t_sample_us / n_sample);
3389
+ log("%s: encode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_encode_us, n_encode, 1e-3f * ctx->state->t_encode_us / n_encode);
3390
+ log("%s: decode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_decode_us, n_decode, 1e-3f * ctx->state->t_decode_us / n_decode);
3375
3391
  }
3376
- fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
3392
+ log("%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
3377
3393
  }
3378
3394
 
3379
3395
  void whisper_reset_timings(struct whisper_context * ctx) {
@@ -3697,7 +3713,7 @@ static void whisper_process_logits(
3697
3713
  const bool last_was_timestamp = tokens_cur.size() > 0 && tokens_cur.back().id >= vocab.token_beg;
3698
3714
  const bool penultimate_was_timestamp = tokens_cur.size() < 2 || tokens_cur[tokens_cur.size() - 2].id >= vocab.token_beg;
3699
3715
 
3700
- //fprintf(stderr, "last_was_timestamp=%d penultimate_was_timestamp=%d\n", last_was_timestamp, penultimate_was_timestamp);
3716
+ //log("last_was_timestamp=%d penultimate_was_timestamp=%d\n", last_was_timestamp, penultimate_was_timestamp);
3701
3717
 
3702
3718
  if (last_was_timestamp) {
3703
3719
  if (penultimate_was_timestamp) {
@@ -3773,7 +3789,7 @@ static void whisper_process_logits(
3773
3789
 
3774
3790
  const float max_text_token_logprob = *std::max_element(logprobs.begin(), logprobs.begin() + vocab.token_beg);
3775
3791
 
3776
- //fprintf(stderr, "timestamp_logprob=%f max_text_token_logprob=%f\n", timestamp_logprob, max_text_token_logprob);
3792
+ //log("timestamp_logprob=%f max_text_token_logprob=%f\n", timestamp_logprob, max_text_token_logprob);
3777
3793
 
3778
3794
  if (timestamp_logprob > max_text_token_logprob) {
3779
3795
  for (int i = 0; i < vocab.token_beg; ++i) {
@@ -4022,12 +4038,12 @@ int whisper_full_with_state(
4022
4038
  // compute log mel spectrogram
4023
4039
  if (params.speed_up) {
4024
4040
  if (whisper_pcm_to_mel_phase_vocoder_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
4025
- fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
4041
+ log("%s: failed to compute log mel spectrogram\n", __func__);
4026
4042
  return -1;
4027
4043
  }
4028
4044
  } else {
4029
4045
  if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
4030
- fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
4046
+ log("%s: failed to compute log mel spectrogram\n", __func__);
4031
4047
  return -2;
4032
4048
  }
4033
4049
  }
@@ -4038,13 +4054,13 @@ int whisper_full_with_state(
4038
4054
 
4039
4055
  const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data());
4040
4056
  if (lang_id < 0) {
4041
- fprintf(stderr, "%s: failed to auto-detect language\n", __func__);
4057
+ log("%s: failed to auto-detect language\n", __func__);
4042
4058
  return -3;
4043
4059
  }
4044
4060
  state->lang_id = lang_id;
4045
4061
  params.language = whisper_lang_str(lang_id);
4046
4062
 
4047
- fprintf(stderr, "%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
4063
+ log("%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
4048
4064
  if (params.detect_language) {
4049
4065
  return 0;
4050
4066
  }
@@ -4101,7 +4117,7 @@ int whisper_full_with_state(
4101
4117
  if (decoder.kv_self.ctx == nullptr) {
4102
4118
  decoder.kv_self = state->decoders[0].kv_self;
4103
4119
  if (!kv_cache_reinit(decoder.kv_self)) {
4104
- fprintf(stderr, "%s: kv_cache_reinit() failed for self-attention, decoder %d\n", __func__, j);
4120
+ log("%s: kv_cache_reinit() failed for self-attention, decoder %d\n", __func__, j);
4105
4121
  return -4;
4106
4122
  }
4107
4123
 
@@ -4145,7 +4161,7 @@ int whisper_full_with_state(
4145
4161
 
4146
4162
  // overwrite audio_ctx, max allowed is hparams.n_audio_ctx
4147
4163
  if (params.audio_ctx > whisper_n_audio_ctx(ctx)) {
4148
- fprintf(stderr, "%s: audio_ctx is larger than the maximum allowed (%d > %d)\n", __func__, params.audio_ctx, whisper_n_audio_ctx(ctx));
4164
+ log("%s: audio_ctx is larger than the maximum allowed (%d > %d)\n", __func__, params.audio_ctx, whisper_n_audio_ctx(ctx));
4149
4165
  return -5;
4150
4166
  }
4151
4167
  state->exp_n_audio_ctx = params.audio_ctx;
@@ -4163,9 +4179,6 @@ int whisper_full_with_state(
4163
4179
  }
4164
4180
  }
4165
4181
 
4166
- int progress_prev = 0;
4167
- int progress_step = 5;
4168
-
4169
4182
  int seek = seek_start;
4170
4183
 
4171
4184
  std::vector<whisper_token> prompt;
@@ -4192,16 +4205,11 @@ int whisper_full_with_state(
4192
4205
 
4193
4206
  // main loop
4194
4207
  while (true) {
4195
- const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
4196
- while (progress_cur >= progress_prev + progress_step) {
4197
- progress_prev += progress_step;
4198
- if (params.print_progress) {
4199
- fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress_prev);
4200
- }
4201
- }
4202
4208
  if (params.progress_callback) {
4209
+ const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
4210
+
4203
4211
  params.progress_callback(
4204
- ctx, ctx->state, progress_prev, params.progress_callback_user_data);
4212
+ ctx, ctx->state, progress_cur, params.progress_callback_user_data);
4205
4213
  }
4206
4214
 
4207
4215
  // of only 1 second left, then stop
@@ -4211,14 +4219,14 @@ int whisper_full_with_state(
4211
4219
 
4212
4220
  if (params.encoder_begin_callback) {
4213
4221
  if (params.encoder_begin_callback(ctx, state, params.encoder_begin_callback_user_data) == false) {
4214
- fprintf(stderr, "%s: encoder_begin_callback returned false - aborting\n", __func__);
4222
+ log("%s: encoder_begin_callback returned false - aborting\n", __func__);
4215
4223
  break;
4216
4224
  }
4217
4225
  }
4218
4226
 
4219
4227
  // encode audio features starting at offset seek
4220
4228
  if (!whisper_encode_internal(*ctx, *state, seek, params.n_threads)) {
4221
- fprintf(stderr, "%s: failed to encode\n", __func__);
4229
+ log("%s: failed to encode\n", __func__);
4222
4230
  return -6;
4223
4231
  }
4224
4232
 
@@ -4301,7 +4309,7 @@ int whisper_full_with_state(
4301
4309
  WHISPER_PRINT_DEBUG("\n\n");
4302
4310
 
4303
4311
  if (!whisper_decode_internal(*ctx, *state, state->decoders[0], prompt.data(), prompt.size(), 0, params.n_threads)) {
4304
- fprintf(stderr, "%s: failed to decode\n", __func__);
4312
+ log("%s: failed to decode\n", __func__);
4305
4313
  return -7;
4306
4314
  }
4307
4315
 
@@ -4539,7 +4547,7 @@ int whisper_full_with_state(
4539
4547
  //WHISPER_PRINT_DEBUG("%s: decoder %d: token %d, kv_self.n %d, seek_delta %d\n", __func__, j, decoder.tokens_tmp[0], decoder.kv_self.n, decoder.seek_delta);
4540
4548
 
4541
4549
  if (!whisper_decode_internal(*ctx, *state, decoder, decoder.tokens_tmp.data(), decoder.tokens_tmp.size(), decoder.kv_self.n, params.n_threads)) {
4542
- fprintf(stderr, "%s: failed to decode\n", __func__);
4550
+ log("%s: failed to decode\n", __func__);
4543
4551
  return -8;
4544
4552
  }
4545
4553
 
@@ -4861,12 +4869,12 @@ int whisper_full_parallel(
4861
4869
  ctx->state->t_decode_us /= n_processors;
4862
4870
 
4863
4871
  // print information about the audio boundaries
4864
- fprintf(stderr, "\n");
4865
- fprintf(stderr, "%s: the audio has been split into %d chunks at the following times:\n", __func__, n_processors);
4872
+ log("\n");
4873
+ log("%s: the audio has been split into %d chunks at the following times:\n", __func__, n_processors);
4866
4874
  for (int i = 0; i < n_processors - 1; ++i) {
4867
- fprintf(stderr, "%s: split %d - %s\n", __func__, (i + 1), to_timestamp(100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t).c_str());
4875
+ log("%s: split %d - %s\n", __func__, (i + 1), to_timestamp(100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t).c_str());
4868
4876
  }
4869
- fprintf(stderr, "%s: the transcription quality may be degraded near these boundaries\n", __func__);
4877
+ log("%s: the transcription quality may be degraded near these boundaries\n", __func__);
4870
4878
 
4871
4879
  return ret;
4872
4880
  }
@@ -5226,7 +5234,7 @@ static void whisper_exp_compute_token_level_timestamps(
5226
5234
  const int n_samples = state.energy.size();
5227
5235
 
5228
5236
  if (n_samples == 0) {
5229
- fprintf(stderr, "%s: no signal data available\n", __func__);
5237
+ log("%s: no signal data available\n", __func__);
5230
5238
  return;
5231
5239
  }
5232
5240
 
@@ -5446,3 +5454,7 @@ static void whisper_exp_compute_token_level_timestamps(
5446
5454
  // }
5447
5455
  //}
5448
5456
  }
5457
+
5458
+ void whisper_set_log_callback(whisper_log_callback callback) {
5459
+ whisper_log = callback;
5460
+ }