whisper.rn 0.3.0-rc.6 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/whisper.cpp CHANGED
@@ -1,8 +1,12 @@
1
1
  #include "whisper.h"
2
- #if WHISPER_USE_COREML
2
+ #ifdef WHISPER_USE_COREML
3
3
  #include "coreml/whisper-encoder.h"
4
4
  #endif
5
5
 
6
+ #if WHISPER_USE_OPENVINO
7
+ #include "openvino/whisper-openvino-encoder.h"
8
+ #endif
9
+
6
10
  #include "ggml.h"
7
11
 
8
12
  #include <algorithm>
@@ -19,6 +23,10 @@
19
23
  #include <regex>
20
24
  #include <random>
21
25
 
26
+ #if defined(_MSC_VER)
27
+ #pragma warning(disable: 4244 4267) // possible loss of data
28
+ #endif
29
+
22
30
  #if defined(GGML_BIG_ENDIAN)
23
31
  #include <bit>
24
32
 
@@ -376,16 +384,18 @@ struct whisper_vocab {
376
384
  std::map<token, id> token_to_id;
377
385
  std::map<id, token> id_to_token;
378
386
 
379
- id token_eot = 50256;
380
- id token_sot = 50257;
381
- id token_prev = 50360;
382
- id token_solm = 50361; // ??
383
- id token_not = 50362; // no timestamps
384
- id token_beg = 50363;
385
-
386
- // available tasks
387
- static const id token_translate = 50358;
388
- static const id token_transcribe = 50359;
387
+ // reference: https://github.com/openai/whisper/blob/248b6cb124225dd263bb9bd32d060b6517e067f8/whisper/tokenizer.py#L334-L349
388
+ id token_eot = 50256;
389
+ id token_sot = 50257;
390
+ // task tokens (used only for multilingual models)
391
+ id token_translate = 50357;
392
+ id token_transcribe = 50358;
393
+ // other special tokens
394
+ id token_solm = 50359; // [TDRZ] used by tinydiarize models to indicate speaker turn
395
+ id token_prev = 50360;
396
+ id token_nosp = 50361;
397
+ id token_not = 50362; // no timestamps
398
+ id token_beg = 50363; // begin timestamps
389
399
 
390
400
  bool is_multilingual() const {
391
401
  return n_vocab == 51865;
@@ -399,6 +409,8 @@ struct whisper_segment {
399
409
  std::string text;
400
410
 
401
411
  std::vector<whisper_token_data> tokens;
412
+
413
+ bool speaker_turn_next;
402
414
  };
403
415
 
404
416
  // medium
@@ -652,6 +664,10 @@ struct whisper_state {
652
664
  whisper_coreml_context * ctx_coreml = nullptr;
653
665
  #endif
654
666
 
667
+ #ifdef WHISPER_USE_OPENVINO
668
+ whisper_openvino_context * ctx_openvino = nullptr;
669
+ #endif
670
+
655
671
  // [EXPERIMENTAL] token-level timestamps data
656
672
  int64_t t_beg = 0;
657
673
  int64_t t_last = 0;
@@ -808,7 +824,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
808
824
  {
809
825
  uint32_t magic;
810
826
  read_safe(loader, magic);
811
- if (magic != 0x67676d6c) {
827
+ if (magic != GGML_FILE_MAGIC) {
812
828
  fprintf(stderr, "%s: invalid model data (bad magic)\n", __func__);
813
829
  return false;
814
830
  }
@@ -962,8 +978,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
962
978
  if (vocab.is_multilingual()) {
963
979
  vocab.token_eot++;
964
980
  vocab.token_sot++;
965
- vocab.token_prev++;
981
+ vocab.token_translate++;
982
+ vocab.token_transcribe++;
966
983
  vocab.token_solm++;
984
+ vocab.token_prev++;
985
+ vocab.token_nosp++;
967
986
  vocab.token_not++;
968
987
  vocab.token_beg++;
969
988
  }
@@ -977,8 +996,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
977
996
  word = "[_EOT_]";
978
997
  } else if (i == vocab.token_sot) {
979
998
  word = "[_SOT_]";
999
+ } else if (i == vocab.token_solm) {
1000
+ word = "[_SOLM_]";
980
1001
  } else if (i == vocab.token_prev) {
981
1002
  word = "[_PREV_]";
1003
+ } else if (i == vocab.token_nosp) {
1004
+ word = "[_NOSP_]";
982
1005
  } else if (i == vocab.token_not) {
983
1006
  word = "[_NOT_]";
984
1007
  } else if (i == vocab.token_beg) {
@@ -1463,12 +1486,18 @@ static bool whisper_encode_internal(
1463
1486
  const bool use_coreml = wstate.ctx_coreml != nullptr;
1464
1487
  #endif
1465
1488
 
1466
- if (!use_coreml) {
1489
+ #ifndef WHISPER_USE_OPENVINO
1490
+ const bool use_openvino = false;
1491
+ #else
1492
+ const bool use_openvino = wstate.ctx_openvino != nullptr;
1493
+ #endif
1494
+
1495
+ if (!use_coreml && !use_openvino) {
1467
1496
  // convolution + gelu
1468
1497
  {
1469
1498
  wstate.use_buf(ctx0, 1);
1470
1499
 
1471
- cur = ggml_conv_1d_1s(ctx0, model.e_conv_1_w, mel);
1500
+ cur = ggml_conv_1d_ph(ctx0, model.e_conv_1_w, mel, 1, 1);
1472
1501
  cur = ggml_add(ctx0,
1473
1502
  ggml_repeat(ctx0,
1474
1503
  model.e_conv_1_b,
@@ -1479,7 +1508,7 @@ static bool whisper_encode_internal(
1479
1508
 
1480
1509
  wstate.use_buf(ctx0, 0);
1481
1510
 
1482
- cur = ggml_conv_1d_2s(ctx0, model.e_conv_2_w, cur);
1511
+ cur = ggml_conv_1d_ph(ctx0, model.e_conv_2_w, cur, 2, 1);
1483
1512
  cur = ggml_add(ctx0,
1484
1513
  ggml_repeat(ctx0,
1485
1514
  model.e_conv_2_b,
@@ -1762,8 +1791,7 @@ static bool whisper_encode_internal(
1762
1791
  }
1763
1792
  }
1764
1793
  #ifdef WHISPER_USE_COREML
1765
- else
1766
- {
1794
+ else if (use_coreml) {
1767
1795
  wstate.use_buf(ctx0, -1);
1768
1796
 
1769
1797
  cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
@@ -1771,6 +1799,17 @@ static bool whisper_encode_internal(
1771
1799
  whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
1772
1800
  }
1773
1801
  #endif
1802
+ #ifdef WHISPER_USE_OPENVINO
1803
+ else if (use_openvino) {
1804
+ wstate.use_buf(ctx0, -1);
1805
+
1806
+ cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
1807
+
1808
+ if (!whisper_openvino_encode(wstate.ctx_openvino, mel, cur)) {
1809
+ return false;
1810
+ }
1811
+ }
1812
+ #endif
1774
1813
 
1775
1814
  // cur
1776
1815
  //{
@@ -2613,6 +2652,31 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
2613
2652
  }
2614
2653
  #endif
2615
2654
 
2655
+ #ifdef WHISPER_USE_OPENVINO
2656
+ // replace .bin with -encoder-openvino.xml
2657
+ static std::string whisper_openvino_get_path_encoder(std::string path_bin) {
2658
+ auto pos = path_bin.rfind('.');
2659
+ if (pos != std::string::npos) {
2660
+ path_bin = path_bin.substr(0, pos);
2661
+ }
2662
+
2663
+ path_bin += "-encoder-openvino.xml";
2664
+
2665
+ return path_bin;
2666
+ }
2667
+
2668
+ static std::string whisper_openvino_get_path_cache(std::string path_bin) {
2669
+ auto pos = path_bin.rfind('.');
2670
+ if (pos != std::string::npos) {
2671
+ path_bin = path_bin.substr(0, pos);
2672
+ }
2673
+
2674
+ path_bin += "-encoder-openvino-cache";
2675
+
2676
+ return path_bin;
2677
+ }
2678
+ #endif
2679
+
2616
2680
  struct whisper_state * whisper_init_state(whisper_context * ctx) {
2617
2681
  whisper_state * state = new whisper_state;
2618
2682
 
@@ -2679,6 +2743,55 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
2679
2743
  return state;
2680
2744
  }
2681
2745
 
2746
+ int whisper_ctx_init_openvino_encoder(
2747
+ struct whisper_context * ctx,
2748
+ const char * model_path,
2749
+ const char * device,
2750
+ const char * cache_dir) {
2751
+ #ifndef WHISPER_USE_OPENVINO
2752
+ (void)(ctx);
2753
+ (void)(model_path);
2754
+ (void)(device);
2755
+ (void)(cache_dir);
2756
+
2757
+ return 1;
2758
+ #else
2759
+ if (!model_path && ctx->path_model.empty()) {
2760
+ fprintf(stderr, "%s: model_path is nullptr, and ctx has no model_path set.\n", __func__);
2761
+ return 1;
2762
+ }
2763
+
2764
+ std::string path_encoder;
2765
+ if (!model_path) {
2766
+ //if model_path is not set, attempt to find it in the same directory as ggml-<model>.bin model
2767
+ path_encoder = whisper_openvino_get_path_encoder(ctx->path_model);
2768
+ } else {
2769
+ path_encoder = model_path;
2770
+ }
2771
+
2772
+ std::string path_cache;
2773
+ if (!cache_dir) {
2774
+ //if cache_dir is not set, set it as a dir residing next to ggml-<model>.bin
2775
+ path_cache = whisper_openvino_get_path_cache(ctx->path_model);
2776
+ } else {
2777
+ path_cache = cache_dir;
2778
+ }
2779
+
2780
+ fprintf(stderr, "%s: loading OpenVINO model from '%s'\n", __func__, path_encoder.c_str());
2781
+ fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
2782
+
2783
+ ctx->state->ctx_openvino = whisper_openvino_init(path_encoder.c_str(), device, path_cache.c_str());
2784
+ if (!ctx->state->ctx_openvino) {
2785
+ fprintf(stderr, "%s: failed to init OpenVINO encoder from '%s'\n", __func__, path_encoder.c_str());
2786
+ return 1;
2787
+ } else {
2788
+ fprintf(stderr, "%s: OpenVINO model loaded\n", __func__);
2789
+ }
2790
+
2791
+ return 0;
2792
+ #endif
2793
+ }
2794
+
2682
2795
  struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
2683
2796
 
2684
2797
  fprintf(stderr, "%s: loading model from '%s'\n", __func__, path_model);
@@ -2833,6 +2946,13 @@ void whisper_free_state(struct whisper_state * state)
2833
2946
  }
2834
2947
  #endif
2835
2948
 
2949
+ #ifdef WHISPER_USE_OPENVINO
2950
+ if (state->ctx_openvino != nullptr) {
2951
+ whisper_openvino_free(state->ctx_openvino);
2952
+ state->ctx_openvino = nullptr;
2953
+ }
2954
+ #endif
2955
+
2836
2956
  delete state;
2837
2957
  }
2838
2958
  }
@@ -3204,12 +3324,16 @@ whisper_token whisper_token_sot(struct whisper_context * ctx) {
3204
3324
  return ctx->vocab.token_sot;
3205
3325
  }
3206
3326
 
3327
+ whisper_token whisper_token_solm(struct whisper_context * ctx) {
3328
+ return ctx->vocab.token_solm;
3329
+ }
3330
+
3207
3331
  whisper_token whisper_token_prev(struct whisper_context * ctx) {
3208
3332
  return ctx->vocab.token_prev;
3209
3333
  }
3210
3334
 
3211
- whisper_token whisper_token_solm(struct whisper_context * ctx) {
3212
- return ctx->vocab.token_solm;
3335
+ whisper_token whisper_token_nosp(struct whisper_context * ctx) {
3336
+ return ctx->vocab.token_nosp;
3213
3337
  }
3214
3338
 
3215
3339
  whisper_token whisper_token_not(struct whisper_context * ctx) {
@@ -3224,12 +3348,12 @@ whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id) {
3224
3348
  return whisper_token_sot(ctx) + 1 + lang_id;
3225
3349
  }
3226
3350
 
3227
- whisper_token whisper_token_translate(void) {
3228
- return whisper_vocab::token_translate;
3351
+ whisper_token whisper_token_translate(struct whisper_context * ctx) {
3352
+ return ctx->vocab.token_translate;
3229
3353
  }
3230
3354
 
3231
- whisper_token whisper_token_transcribe(void) {
3232
- return whisper_vocab::token_transcribe;
3355
+ whisper_token whisper_token_transcribe(struct whisper_context * ctx) {
3356
+ return ctx->vocab.token_transcribe;
3233
3357
  }
3234
3358
 
3235
3359
  void whisper_print_timings(struct whisper_context * ctx) {
@@ -3268,6 +3392,14 @@ static int whisper_has_coreml(void) {
3268
3392
  #endif
3269
3393
  }
3270
3394
 
3395
+ static int whisper_has_openvino(void) {
3396
+ #ifdef WHISPER_USE_OPENVINO
3397
+ return 1;
3398
+ #else
3399
+ return 0;
3400
+ #endif
3401
+ }
3402
+
3271
3403
  const char * whisper_print_system_info(void) {
3272
3404
  static std::string s;
3273
3405
 
@@ -3285,6 +3417,7 @@ const char * whisper_print_system_info(void) {
3285
3417
  s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
3286
3418
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
3287
3419
  s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
3420
+ s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
3288
3421
 
3289
3422
  return s.c_str();
3290
3423
  }
@@ -3301,51 +3434,53 @@ struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sam
3301
3434
 
3302
3435
  struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
3303
3436
  struct whisper_full_params result = {
3304
- /*.strategy =*/ strategy,
3305
-
3306
- /*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
3307
- /*.n_max_text_ctx =*/ 16384,
3308
- /*.offset_ms =*/ 0,
3309
- /*.duration_ms =*/ 0,
3310
-
3311
- /*.translate =*/ false,
3312
- /*.no_context =*/ true,
3313
- /*.single_segment =*/ false,
3314
- /*.print_special =*/ false,
3315
- /*.print_progress =*/ true,
3316
- /*.print_realtime =*/ false,
3317
- /*.print_timestamps =*/ true,
3318
-
3319
- /*.token_timestamps =*/ false,
3320
- /*.thold_pt =*/ 0.01f,
3321
- /*.thold_ptsum =*/ 0.01f,
3322
- /*.max_len =*/ 0,
3323
- /*.split_on_word =*/ false,
3324
- /*.max_tokens =*/ 0,
3325
-
3326
- /*.speed_up =*/ false,
3327
- /*.audio_ctx =*/ 0,
3328
-
3329
- /*.initial_prompt =*/ nullptr,
3330
- /*.prompt_tokens =*/ nullptr,
3331
- /*.prompt_n_tokens =*/ 0,
3332
-
3333
- /*.language =*/ "en",
3334
- /*.detect_language =*/ false,
3335
-
3336
- /*.suppress_blank =*/ true,
3437
+ /*.strategy =*/ strategy,
3438
+
3439
+ /*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
3440
+ /*.n_max_text_ctx =*/ 16384,
3441
+ /*.offset_ms =*/ 0,
3442
+ /*.duration_ms =*/ 0,
3443
+
3444
+ /*.translate =*/ false,
3445
+ /*.no_context =*/ true,
3446
+ /*.single_segment =*/ false,
3447
+ /*.print_special =*/ false,
3448
+ /*.print_progress =*/ true,
3449
+ /*.print_realtime =*/ false,
3450
+ /*.print_timestamps =*/ true,
3451
+
3452
+ /*.token_timestamps =*/ false,
3453
+ /*.thold_pt =*/ 0.01f,
3454
+ /*.thold_ptsum =*/ 0.01f,
3455
+ /*.max_len =*/ 0,
3456
+ /*.split_on_word =*/ false,
3457
+ /*.max_tokens =*/ 0,
3458
+
3459
+ /*.speed_up =*/ false,
3460
+ /*.audio_ctx =*/ 0,
3461
+
3462
+ /*.tdrz_enable =*/ false,
3463
+
3464
+ /*.initial_prompt =*/ nullptr,
3465
+ /*.prompt_tokens =*/ nullptr,
3466
+ /*.prompt_n_tokens =*/ 0,
3467
+
3468
+ /*.language =*/ "en",
3469
+ /*.detect_language =*/ false,
3470
+
3471
+ /*.suppress_blank =*/ true,
3337
3472
  /*.suppress_non_speech_tokens =*/ false,
3338
3473
 
3339
- /*.temperature =*/ 0.0f,
3340
- /*.max_initial_ts =*/ 1.0f,
3341
- /*.length_penalty =*/ -1.0f,
3474
+ /*.temperature =*/ 0.0f,
3475
+ /*.max_initial_ts =*/ 1.0f,
3476
+ /*.length_penalty =*/ -1.0f,
3342
3477
 
3343
- /*.temperature_inc =*/ 0.4f,
3344
- /*.entropy_thold =*/ 2.4f,
3345
- /*.logprob_thold =*/ -1.0f,
3346
- /*.no_speech_thold =*/ 0.6f,
3478
+ /*.temperature_inc =*/ 0.4f,
3479
+ /*.entropy_thold =*/ 2.4f,
3480
+ /*.logprob_thold =*/ -1.0f,
3481
+ /*.no_speech_thold =*/ 0.6f,
3347
3482
 
3348
- /*.greedy =*/ {
3483
+ /*.greedy =*/ {
3349
3484
  /*.best_of =*/ -1,
3350
3485
  },
3351
3486
 
@@ -3397,26 +3532,6 @@ static void whisper_exp_compute_token_level_timestamps(
3397
3532
  float thold_pt,
3398
3533
  float thold_ptsum);
3399
3534
 
3400
- // trim from start (in place)
3401
- static inline void ltrim(std::string &s) {
3402
- s.erase(s.begin(), std::find_if_not(s.begin(), s.end(), [](unsigned char ch) {
3403
- return std::isspace(ch);
3404
- }));
3405
- }
3406
-
3407
- // trim from end (in place)
3408
- static inline void rtrim(std::string &s) {
3409
- s.erase(std::find_if_not(s.rbegin(), s.rend(), [](unsigned char ch) {
3410
- return std::isspace(ch);
3411
- }).base(), s.end());
3412
- }
3413
-
3414
- // trim from both ends (in place)
3415
- static inline void trim(std::string &s) {
3416
- rtrim(s);
3417
- ltrim(s);
3418
- }
3419
-
3420
3535
  static inline bool should_split_on_word(const char * txt, bool split_on_word) {
3421
3536
  if (!split_on_word) return true;
3422
3537
 
@@ -3443,14 +3558,10 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta
3443
3558
  const int cur = strlen(txt);
3444
3559
 
3445
3560
  if (acc + cur > max_len && i > 0 && should_split_on_word(txt, split_on_word)) {
3446
- // split here
3447
- if (split_on_word) {
3448
- trim(text);
3449
- }
3450
-
3451
3561
  state.result_all.back().text = std::move(text);
3452
3562
  state.result_all.back().t1 = token.t0;
3453
3563
  state.result_all.back().tokens.resize(i);
3564
+ state.result_all.back().speaker_turn_next = false;
3454
3565
 
3455
3566
  state.result_all.push_back({});
3456
3567
  state.result_all.back().t0 = token.t0;
@@ -3462,6 +3573,8 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta
3462
3573
  segment.tokens.begin() + i,
3463
3574
  segment.tokens.end());
3464
3575
 
3576
+ state.result_all.back().speaker_turn_next = segment.speaker_turn_next;
3577
+
3465
3578
  acc = 0;
3466
3579
  text = "";
3467
3580
 
@@ -3475,9 +3588,6 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta
3475
3588
  }
3476
3589
  }
3477
3590
 
3478
- if (split_on_word) {
3479
- trim(text);
3480
- }
3481
3591
  state.result_all.back().text = std::move(text);
3482
3592
 
3483
3593
  return res;
@@ -3543,9 +3653,14 @@ static void whisper_process_logits(
3543
3653
  // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L410-L412
3544
3654
  logits[vocab.token_not] = -INFINITY;
3545
3655
 
3546
- // suppress sot and solm tokens
3656
+ // suppress sot and nosp tokens
3547
3657
  logits[vocab.token_sot] = -INFINITY;
3548
- logits[vocab.token_solm] = -INFINITY;
3658
+ logits[vocab.token_nosp] = -INFINITY; // TODO: ignore this token for now
3659
+
3660
+ // [TDRZ] when tinydiarize is disabled, suppress solm token
3661
+ if (params.tdrz_enable == false) {
3662
+ logits[vocab.token_solm] = -INFINITY;
3663
+ }
3549
3664
 
3550
3665
  // suppress task tokens
3551
3666
  logits[vocab.token_translate] = -INFINITY;
@@ -4042,9 +4157,9 @@ int whisper_full_with_state(
4042
4157
  state->lang_id = lang_id;
4043
4158
  prompt_init.push_back(whisper_token_lang(ctx, lang_id));
4044
4159
  if (params.translate) {
4045
- prompt_init.push_back(whisper_token_translate());
4160
+ prompt_init.push_back(whisper_token_translate(ctx));
4046
4161
  } else {
4047
- prompt_init.push_back(whisper_token_transcribe());
4162
+ prompt_init.push_back(whisper_token_transcribe(ctx));
4048
4163
  }
4049
4164
  }
4050
4165
 
@@ -4524,23 +4639,27 @@ int whisper_full_with_state(
4524
4639
  prompt_past.push_back(tokens_cur[i].id);
4525
4640
  }
4526
4641
 
4527
- // store the text from this iteration
4528
4642
  if (!tokens_cur.empty() && ctx->model.n_loaded > 0) {
4529
4643
  int i0 = 0;
4530
4644
  auto t0 = seek + 2*(tokens_cur.front().tid - whisper_token_beg(ctx));
4531
4645
 
4532
4646
  std::string text;
4647
+ bool speaker_turn_next = false;
4533
4648
 
4534
4649
  for (int i = 0; i < (int) tokens_cur.size(); i++) {
4535
4650
  //printf("%s: %18s %6.3f %18s %6.3f\n", __func__,
4536
4651
  // ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p,
4537
4652
  // ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt);
4538
4653
 
4539
- if (params.print_special == false && tokens_cur[i].id >= whisper_token_eot(ctx)) {
4540
- } else {
4654
+ if (params.print_special || tokens_cur[i].id < whisper_token_eot(ctx)) {
4541
4655
  text += whisper_token_to_str(ctx, tokens_cur[i].id);
4542
4656
  }
4543
4657
 
4658
+ // [TDRZ] record if speaker turn was predicted after current segment
4659
+ if (params.tdrz_enable && tokens_cur[i].id == whisper_token_solm(ctx)) {
4660
+ speaker_turn_next = true;
4661
+ }
4662
+
4544
4663
  if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
4545
4664
  const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
4546
4665
 
@@ -4559,7 +4678,7 @@ int whisper_full_with_state(
4559
4678
 
4560
4679
  //printf("tt0 = %d, tt1 = %d, text = %s, token = %s, token_id = %d, tid = %d\n", tt0, tt1, text.c_str(), ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].id, tokens_cur[i].tid);
4561
4680
 
4562
- result_all.push_back({ tt0, tt1, text, {} });
4681
+ result_all.push_back({ tt0, tt1, text, {}, speaker_turn_next });
4563
4682
  for (int j = i0; j <= i; j++) {
4564
4683
  result_all.back().tokens.push_back(tokens_cur[j]);
4565
4684
  }
@@ -4585,6 +4704,7 @@ int whisper_full_with_state(
4585
4704
  i--;
4586
4705
  t0 = t1;
4587
4706
  i0 = i + 1;
4707
+ speaker_turn_next = false;
4588
4708
  }
4589
4709
  }
4590
4710
 
@@ -4603,7 +4723,7 @@ int whisper_full_with_state(
4603
4723
  }
4604
4724
  }
4605
4725
 
4606
- result_all.push_back({ tt0, tt1, text, {} });
4726
+ result_all.push_back({ tt0, tt1, text, {} , speaker_turn_next });
4607
4727
  for (int j = i0; j < (int) tokens_cur.size(); j++) {
4608
4728
  result_all.back().tokens.push_back(tokens_cur[j]);
4609
4729
  }
@@ -4783,6 +4903,10 @@ int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment)
4783
4903
  return ctx->state->result_all[i_segment].t1;
4784
4904
  }
4785
4905
 
4906
+ bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment) {
4907
+ return ctx->state->result_all[i_segment].speaker_turn_next;
4908
+ }
4909
+
4786
4910
  const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment) {
4787
4911
  return state->result_all[i_segment].text.c_str();
4788
4912
  }
package/cpp/whisper.h CHANGED
@@ -110,6 +110,23 @@ extern "C" {
110
110
 
111
111
  WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
112
112
 
113
+ // Given a context, enable use of OpenVINO for encode inference.
114
+ // model_path: Optional path to OpenVINO encoder IR model. If set to nullptr,
115
+ // the path will be generated from the ggml model path that was passed
116
+ // in to whisper_init_from_file. For example, if 'path_model' was
117
+ // "/path/to/ggml-base.en.bin", then OpenVINO IR model path will be
118
+ // assumed to be "/path/to/ggml-base.en-encoder-openvino.xml".
119
+ // device: OpenVINO device to run inference on ("CPU", "GPU", etc.)
120
+ // cache_dir: Optional cache directory that can speed up init time, especially for
121
+ // GPU, by caching compiled 'blobs' there.
122
+ // Set to nullptr if not used.
123
+ // Returns 0 on success. If OpenVINO is not enabled in build, this simply returns 1.
124
+ WHISPER_API int whisper_ctx_init_openvino_encoder(
125
+ struct whisper_context * ctx,
126
+ const char * model_path,
127
+ const char * device,
128
+ const char * cache_dir);
129
+
113
130
  // Frees all allocated memory
114
131
  WHISPER_API void whisper_free (struct whisper_context * ctx);
115
132
  WHISPER_API void whisper_free_state(struct whisper_state * state);
@@ -277,15 +294,16 @@ extern "C" {
277
294
  // Special tokens
278
295
  WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
279
296
  WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
280
- WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
281
297
  WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
298
+ WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
299
+ WHISPER_API whisper_token whisper_token_nosp(struct whisper_context * ctx);
282
300
  WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
283
301
  WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
284
302
  WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);
285
303
 
286
304
  // Task tokens
287
- WHISPER_API whisper_token whisper_token_translate (void);
288
- WHISPER_API whisper_token whisper_token_transcribe(void);
305
+ WHISPER_API whisper_token whisper_token_translate (struct whisper_context * ctx);
306
+ WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx);
289
307
 
290
308
  // Performance information from the default state.
291
309
  WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
@@ -358,6 +376,9 @@ extern "C" {
358
376
  bool speed_up; // speed-up the audio by 2x using Phase Vocoder
359
377
  int audio_ctx; // overwrite the audio context size (0 = use default)
360
378
 
379
+ // [EXPERIMENTAL] [TDRZ] tinydiarize
380
+ bool tdrz_enable; // enable tinydiarize speaker turn detection
381
+
361
382
  // tokens to provide to the whisper decoder as initial prompt
362
383
  // these are prepended to any existing text context from a previous call
363
384
  const char * initial_prompt;
@@ -460,6 +481,9 @@ extern "C" {
460
481
  WHISPER_API int64_t whisper_full_get_segment_t1 (struct whisper_context * ctx, int i_segment);
461
482
  WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
462
483
 
484
+ // Get whether the next segment is predicted as a speaker turn
485
+ WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment);
486
+
463
487
  // Get the text of the specified segment
464
488
  WHISPER_API const char * whisper_full_get_segment_text (struct whisper_context * ctx, int i_segment);
465
489
  WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
@@ -488,9 +512,9 @@ extern "C" {
488
512
 
489
513
  // Temporary helpers needed for exposing ggml interface
490
514
 
491
- WHISPER_API int whisper_bench_memcpy(int n_threads);
492
- WHISPER_API const char * whisper_bench_memcpy_str(int n_threads);
493
- WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
515
+ WHISPER_API int whisper_bench_memcpy (int n_threads);
516
+ WHISPER_API const char * whisper_bench_memcpy_str (int n_threads);
517
+ WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads);
494
518
  WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
495
519
 
496
520
  #ifdef __cplusplus
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "whisper.rn",
3
- "version": "0.3.0-rc.6",
3
+ "version": "0.3.0",
4
4
  "description": "React Native binding of whisper.cpp",
5
5
  "main": "lib/commonjs/index",
6
6
  "module": "lib/module/index",