whisper.rn 0.3.0-rc.6 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -9
- package/android/src/main/jni/whisper/Whisper.mk +11 -8
- package/cpp/ggml.c +4627 -1594
- package/cpp/ggml.h +427 -25
- package/cpp/whisper.cpp +226 -102
- package/cpp/whisper.h +30 -6
- package/package.json +1 -1
- package/whisper-rn.podspec +10 -6
package/cpp/whisper.cpp
CHANGED
|
@@ -1,8 +1,12 @@
|
|
|
1
1
|
#include "whisper.h"
|
|
2
|
-
#
|
|
2
|
+
#ifdef WHISPER_USE_COREML
|
|
3
3
|
#include "coreml/whisper-encoder.h"
|
|
4
4
|
#endif
|
|
5
5
|
|
|
6
|
+
#if WHISPER_USE_OPENVINO
|
|
7
|
+
#include "openvino/whisper-openvino-encoder.h"
|
|
8
|
+
#endif
|
|
9
|
+
|
|
6
10
|
#include "ggml.h"
|
|
7
11
|
|
|
8
12
|
#include <algorithm>
|
|
@@ -19,6 +23,10 @@
|
|
|
19
23
|
#include <regex>
|
|
20
24
|
#include <random>
|
|
21
25
|
|
|
26
|
+
#if defined(_MSC_VER)
|
|
27
|
+
#pragma warning(disable: 4244 4267) // possible loss of data
|
|
28
|
+
#endif
|
|
29
|
+
|
|
22
30
|
#if defined(GGML_BIG_ENDIAN)
|
|
23
31
|
#include <bit>
|
|
24
32
|
|
|
@@ -376,16 +384,18 @@ struct whisper_vocab {
|
|
|
376
384
|
std::map<token, id> token_to_id;
|
|
377
385
|
std::map<id, token> id_to_token;
|
|
378
386
|
|
|
379
|
-
|
|
380
|
-
id
|
|
381
|
-
id
|
|
382
|
-
|
|
383
|
-
id
|
|
384
|
-
id
|
|
385
|
-
|
|
386
|
-
//
|
|
387
|
-
|
|
388
|
-
|
|
387
|
+
// reference: https://github.com/openai/whisper/blob/248b6cb124225dd263bb9bd32d060b6517e067f8/whisper/tokenizer.py#L334-L349
|
|
388
|
+
id token_eot = 50256;
|
|
389
|
+
id token_sot = 50257;
|
|
390
|
+
// task tokens (used only for multilingual models)
|
|
391
|
+
id token_translate = 50357;
|
|
392
|
+
id token_transcribe = 50358;
|
|
393
|
+
// other special tokens
|
|
394
|
+
id token_solm = 50359; // [TDRZ] used by tinydiarize models to indicate speaker turn
|
|
395
|
+
id token_prev = 50360;
|
|
396
|
+
id token_nosp = 50361;
|
|
397
|
+
id token_not = 50362; // no timestamps
|
|
398
|
+
id token_beg = 50363; // begin timestamps
|
|
389
399
|
|
|
390
400
|
bool is_multilingual() const {
|
|
391
401
|
return n_vocab == 51865;
|
|
@@ -399,6 +409,8 @@ struct whisper_segment {
|
|
|
399
409
|
std::string text;
|
|
400
410
|
|
|
401
411
|
std::vector<whisper_token_data> tokens;
|
|
412
|
+
|
|
413
|
+
bool speaker_turn_next;
|
|
402
414
|
};
|
|
403
415
|
|
|
404
416
|
// medium
|
|
@@ -652,6 +664,10 @@ struct whisper_state {
|
|
|
652
664
|
whisper_coreml_context * ctx_coreml = nullptr;
|
|
653
665
|
#endif
|
|
654
666
|
|
|
667
|
+
#ifdef WHISPER_USE_OPENVINO
|
|
668
|
+
whisper_openvino_context * ctx_openvino = nullptr;
|
|
669
|
+
#endif
|
|
670
|
+
|
|
655
671
|
// [EXPERIMENTAL] token-level timestamps data
|
|
656
672
|
int64_t t_beg = 0;
|
|
657
673
|
int64_t t_last = 0;
|
|
@@ -808,7 +824,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
808
824
|
{
|
|
809
825
|
uint32_t magic;
|
|
810
826
|
read_safe(loader, magic);
|
|
811
|
-
if (magic !=
|
|
827
|
+
if (magic != GGML_FILE_MAGIC) {
|
|
812
828
|
fprintf(stderr, "%s: invalid model data (bad magic)\n", __func__);
|
|
813
829
|
return false;
|
|
814
830
|
}
|
|
@@ -962,8 +978,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
962
978
|
if (vocab.is_multilingual()) {
|
|
963
979
|
vocab.token_eot++;
|
|
964
980
|
vocab.token_sot++;
|
|
965
|
-
vocab.
|
|
981
|
+
vocab.token_translate++;
|
|
982
|
+
vocab.token_transcribe++;
|
|
966
983
|
vocab.token_solm++;
|
|
984
|
+
vocab.token_prev++;
|
|
985
|
+
vocab.token_nosp++;
|
|
967
986
|
vocab.token_not++;
|
|
968
987
|
vocab.token_beg++;
|
|
969
988
|
}
|
|
@@ -977,8 +996,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
|
977
996
|
word = "[_EOT_]";
|
|
978
997
|
} else if (i == vocab.token_sot) {
|
|
979
998
|
word = "[_SOT_]";
|
|
999
|
+
} else if (i == vocab.token_solm) {
|
|
1000
|
+
word = "[_SOLM_]";
|
|
980
1001
|
} else if (i == vocab.token_prev) {
|
|
981
1002
|
word = "[_PREV_]";
|
|
1003
|
+
} else if (i == vocab.token_nosp) {
|
|
1004
|
+
word = "[_NOSP_]";
|
|
982
1005
|
} else if (i == vocab.token_not) {
|
|
983
1006
|
word = "[_NOT_]";
|
|
984
1007
|
} else if (i == vocab.token_beg) {
|
|
@@ -1463,12 +1486,18 @@ static bool whisper_encode_internal(
|
|
|
1463
1486
|
const bool use_coreml = wstate.ctx_coreml != nullptr;
|
|
1464
1487
|
#endif
|
|
1465
1488
|
|
|
1466
|
-
|
|
1489
|
+
#ifndef WHISPER_USE_OPENVINO
|
|
1490
|
+
const bool use_openvino = false;
|
|
1491
|
+
#else
|
|
1492
|
+
const bool use_openvino = wstate.ctx_openvino != nullptr;
|
|
1493
|
+
#endif
|
|
1494
|
+
|
|
1495
|
+
if (!use_coreml && !use_openvino) {
|
|
1467
1496
|
// convolution + gelu
|
|
1468
1497
|
{
|
|
1469
1498
|
wstate.use_buf(ctx0, 1);
|
|
1470
1499
|
|
|
1471
|
-
cur =
|
|
1500
|
+
cur = ggml_conv_1d_ph(ctx0, model.e_conv_1_w, mel, 1, 1);
|
|
1472
1501
|
cur = ggml_add(ctx0,
|
|
1473
1502
|
ggml_repeat(ctx0,
|
|
1474
1503
|
model.e_conv_1_b,
|
|
@@ -1479,7 +1508,7 @@ static bool whisper_encode_internal(
|
|
|
1479
1508
|
|
|
1480
1509
|
wstate.use_buf(ctx0, 0);
|
|
1481
1510
|
|
|
1482
|
-
cur =
|
|
1511
|
+
cur = ggml_conv_1d_ph(ctx0, model.e_conv_2_w, cur, 2, 1);
|
|
1483
1512
|
cur = ggml_add(ctx0,
|
|
1484
1513
|
ggml_repeat(ctx0,
|
|
1485
1514
|
model.e_conv_2_b,
|
|
@@ -1762,8 +1791,7 @@ static bool whisper_encode_internal(
|
|
|
1762
1791
|
}
|
|
1763
1792
|
}
|
|
1764
1793
|
#ifdef WHISPER_USE_COREML
|
|
1765
|
-
else
|
|
1766
|
-
{
|
|
1794
|
+
else if (use_coreml) {
|
|
1767
1795
|
wstate.use_buf(ctx0, -1);
|
|
1768
1796
|
|
|
1769
1797
|
cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
|
|
@@ -1771,6 +1799,17 @@ static bool whisper_encode_internal(
|
|
|
1771
1799
|
whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
|
|
1772
1800
|
}
|
|
1773
1801
|
#endif
|
|
1802
|
+
#ifdef WHISPER_USE_OPENVINO
|
|
1803
|
+
else if (use_openvino) {
|
|
1804
|
+
wstate.use_buf(ctx0, -1);
|
|
1805
|
+
|
|
1806
|
+
cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
|
|
1807
|
+
|
|
1808
|
+
if (!whisper_openvino_encode(wstate.ctx_openvino, mel, cur)) {
|
|
1809
|
+
return false;
|
|
1810
|
+
}
|
|
1811
|
+
}
|
|
1812
|
+
#endif
|
|
1774
1813
|
|
|
1775
1814
|
// cur
|
|
1776
1815
|
//{
|
|
@@ -2613,6 +2652,31 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
|
|
|
2613
2652
|
}
|
|
2614
2653
|
#endif
|
|
2615
2654
|
|
|
2655
|
+
#ifdef WHISPER_USE_OPENVINO
|
|
2656
|
+
// replace .bin with-encoder-openvino.xml
|
|
2657
|
+
static std::string whisper_openvino_get_path_encoder(std::string path_bin) {
|
|
2658
|
+
auto pos = path_bin.rfind('.');
|
|
2659
|
+
if (pos != std::string::npos) {
|
|
2660
|
+
path_bin = path_bin.substr(0, pos);
|
|
2661
|
+
}
|
|
2662
|
+
|
|
2663
|
+
path_bin += "-encoder-openvino.xml";
|
|
2664
|
+
|
|
2665
|
+
return path_bin;
|
|
2666
|
+
}
|
|
2667
|
+
|
|
2668
|
+
static std::string whisper_openvino_get_path_cache(std::string path_bin) {
|
|
2669
|
+
auto pos = path_bin.rfind('.');
|
|
2670
|
+
if (pos != std::string::npos) {
|
|
2671
|
+
path_bin = path_bin.substr(0, pos);
|
|
2672
|
+
}
|
|
2673
|
+
|
|
2674
|
+
path_bin += "-encoder-openvino-cache";
|
|
2675
|
+
|
|
2676
|
+
return path_bin;
|
|
2677
|
+
}
|
|
2678
|
+
#endif
|
|
2679
|
+
|
|
2616
2680
|
struct whisper_state * whisper_init_state(whisper_context * ctx) {
|
|
2617
2681
|
whisper_state * state = new whisper_state;
|
|
2618
2682
|
|
|
@@ -2679,6 +2743,55 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
|
|
|
2679
2743
|
return state;
|
|
2680
2744
|
}
|
|
2681
2745
|
|
|
2746
|
+
int whisper_ctx_init_openvino_encoder(
|
|
2747
|
+
struct whisper_context * ctx,
|
|
2748
|
+
const char * model_path,
|
|
2749
|
+
const char * device,
|
|
2750
|
+
const char * cache_dir) {
|
|
2751
|
+
#ifndef WHISPER_USE_OPENVINO
|
|
2752
|
+
(void)(ctx);
|
|
2753
|
+
(void)(model_path);
|
|
2754
|
+
(void)(device);
|
|
2755
|
+
(void)(cache_dir);
|
|
2756
|
+
|
|
2757
|
+
return 1;
|
|
2758
|
+
#else
|
|
2759
|
+
if (!model_path && ctx->path_model.empty()) {
|
|
2760
|
+
fprintf(stderr, "%s: model_path is nullptr, and ctx has no model_path set.\n", __func__);
|
|
2761
|
+
return 1;
|
|
2762
|
+
}
|
|
2763
|
+
|
|
2764
|
+
std::string path_encoder;
|
|
2765
|
+
if (!model_path) {
|
|
2766
|
+
//if model_path is not set, attempt to find it in the same directory as ggml-<model>.bin model
|
|
2767
|
+
path_encoder = whisper_openvino_get_path_encoder(ctx->path_model);
|
|
2768
|
+
} else {
|
|
2769
|
+
path_encoder = model_path;
|
|
2770
|
+
}
|
|
2771
|
+
|
|
2772
|
+
std::string path_cache;
|
|
2773
|
+
if (!cache_dir) {
|
|
2774
|
+
//if cache_dir is not set, set it as a dir residing next to ggml-<model>.bin
|
|
2775
|
+
path_cache = whisper_openvino_get_path_cache(ctx->path_model);
|
|
2776
|
+
} else {
|
|
2777
|
+
path_cache = cache_dir;
|
|
2778
|
+
}
|
|
2779
|
+
|
|
2780
|
+
fprintf(stderr, "%s: loading OpenVINO model from '%s'\n", __func__, path_encoder.c_str());
|
|
2781
|
+
fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
|
|
2782
|
+
|
|
2783
|
+
ctx->state->ctx_openvino = whisper_openvino_init(path_encoder.c_str(), device, path_cache.c_str());
|
|
2784
|
+
if (!ctx->state->ctx_openvino) {
|
|
2785
|
+
fprintf(stderr, "%s: failed to init OpenVINO encoder from '%s'\n", __func__, path_encoder.c_str());
|
|
2786
|
+
return 1;
|
|
2787
|
+
} else {
|
|
2788
|
+
fprintf(stderr, "%s: OpenVINO model loaded\n", __func__);
|
|
2789
|
+
}
|
|
2790
|
+
|
|
2791
|
+
return 0;
|
|
2792
|
+
#endif
|
|
2793
|
+
}
|
|
2794
|
+
|
|
2682
2795
|
struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
|
|
2683
2796
|
|
|
2684
2797
|
fprintf(stderr, "%s: loading model from '%s'\n", __func__, path_model);
|
|
@@ -2833,6 +2946,13 @@ void whisper_free_state(struct whisper_state * state)
|
|
|
2833
2946
|
}
|
|
2834
2947
|
#endif
|
|
2835
2948
|
|
|
2949
|
+
#ifdef WHISPER_USE_OPENVINO
|
|
2950
|
+
if (state->ctx_openvino != nullptr) {
|
|
2951
|
+
whisper_openvino_free(state->ctx_openvino);
|
|
2952
|
+
state->ctx_openvino = nullptr;
|
|
2953
|
+
}
|
|
2954
|
+
#endif
|
|
2955
|
+
|
|
2836
2956
|
delete state;
|
|
2837
2957
|
}
|
|
2838
2958
|
}
|
|
@@ -3204,12 +3324,16 @@ whisper_token whisper_token_sot(struct whisper_context * ctx) {
|
|
|
3204
3324
|
return ctx->vocab.token_sot;
|
|
3205
3325
|
}
|
|
3206
3326
|
|
|
3327
|
+
whisper_token whisper_token_solm(struct whisper_context * ctx) {
|
|
3328
|
+
return ctx->vocab.token_solm;
|
|
3329
|
+
}
|
|
3330
|
+
|
|
3207
3331
|
whisper_token whisper_token_prev(struct whisper_context * ctx) {
|
|
3208
3332
|
return ctx->vocab.token_prev;
|
|
3209
3333
|
}
|
|
3210
3334
|
|
|
3211
|
-
whisper_token
|
|
3212
|
-
return ctx->vocab.
|
|
3335
|
+
whisper_token whisper_token_nosp(struct whisper_context * ctx) {
|
|
3336
|
+
return ctx->vocab.token_nosp;
|
|
3213
3337
|
}
|
|
3214
3338
|
|
|
3215
3339
|
whisper_token whisper_token_not(struct whisper_context * ctx) {
|
|
@@ -3224,12 +3348,12 @@ whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id) {
|
|
|
3224
3348
|
return whisper_token_sot(ctx) + 1 + lang_id;
|
|
3225
3349
|
}
|
|
3226
3350
|
|
|
3227
|
-
whisper_token whisper_token_translate(
|
|
3228
|
-
return
|
|
3351
|
+
whisper_token whisper_token_translate(struct whisper_context * ctx) {
|
|
3352
|
+
return ctx->vocab.token_translate;
|
|
3229
3353
|
}
|
|
3230
3354
|
|
|
3231
|
-
whisper_token whisper_token_transcribe(
|
|
3232
|
-
return
|
|
3355
|
+
whisper_token whisper_token_transcribe(struct whisper_context * ctx) {
|
|
3356
|
+
return ctx->vocab.token_transcribe;
|
|
3233
3357
|
}
|
|
3234
3358
|
|
|
3235
3359
|
void whisper_print_timings(struct whisper_context * ctx) {
|
|
@@ -3268,6 +3392,14 @@ static int whisper_has_coreml(void) {
|
|
|
3268
3392
|
#endif
|
|
3269
3393
|
}
|
|
3270
3394
|
|
|
3395
|
+
static int whisper_has_openvino(void) {
|
|
3396
|
+
#ifdef WHISPER_USE_OPENVINO
|
|
3397
|
+
return 1;
|
|
3398
|
+
#else
|
|
3399
|
+
return 0;
|
|
3400
|
+
#endif
|
|
3401
|
+
}
|
|
3402
|
+
|
|
3271
3403
|
const char * whisper_print_system_info(void) {
|
|
3272
3404
|
static std::string s;
|
|
3273
3405
|
|
|
@@ -3285,6 +3417,7 @@ const char * whisper_print_system_info(void) {
|
|
|
3285
3417
|
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
|
3286
3418
|
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
|
3287
3419
|
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
|
|
3420
|
+
s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
|
|
3288
3421
|
|
|
3289
3422
|
return s.c_str();
|
|
3290
3423
|
}
|
|
@@ -3301,51 +3434,53 @@ struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sam
|
|
|
3301
3434
|
|
|
3302
3435
|
struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
|
|
3303
3436
|
struct whisper_full_params result = {
|
|
3304
|
-
/*.strategy
|
|
3305
|
-
|
|
3306
|
-
/*.n_threads
|
|
3307
|
-
/*.n_max_text_ctx
|
|
3308
|
-
/*.offset_ms
|
|
3309
|
-
/*.duration_ms
|
|
3310
|
-
|
|
3311
|
-
/*.translate
|
|
3312
|
-
/*.no_context
|
|
3313
|
-
/*.single_segment
|
|
3314
|
-
/*.print_special
|
|
3315
|
-
/*.print_progress
|
|
3316
|
-
/*.print_realtime
|
|
3317
|
-
/*.print_timestamps
|
|
3318
|
-
|
|
3319
|
-
/*.token_timestamps
|
|
3320
|
-
/*.thold_pt
|
|
3321
|
-
/*.thold_ptsum
|
|
3322
|
-
/*.max_len
|
|
3323
|
-
/*.split_on_word
|
|
3324
|
-
/*.max_tokens
|
|
3325
|
-
|
|
3326
|
-
/*.speed_up
|
|
3327
|
-
/*.audio_ctx
|
|
3328
|
-
|
|
3329
|
-
/*.
|
|
3330
|
-
|
|
3331
|
-
/*.
|
|
3332
|
-
|
|
3333
|
-
/*.
|
|
3334
|
-
|
|
3335
|
-
|
|
3336
|
-
/*.
|
|
3437
|
+
/*.strategy =*/ strategy,
|
|
3438
|
+
|
|
3439
|
+
/*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
|
|
3440
|
+
/*.n_max_text_ctx =*/ 16384,
|
|
3441
|
+
/*.offset_ms =*/ 0,
|
|
3442
|
+
/*.duration_ms =*/ 0,
|
|
3443
|
+
|
|
3444
|
+
/*.translate =*/ false,
|
|
3445
|
+
/*.no_context =*/ true,
|
|
3446
|
+
/*.single_segment =*/ false,
|
|
3447
|
+
/*.print_special =*/ false,
|
|
3448
|
+
/*.print_progress =*/ true,
|
|
3449
|
+
/*.print_realtime =*/ false,
|
|
3450
|
+
/*.print_timestamps =*/ true,
|
|
3451
|
+
|
|
3452
|
+
/*.token_timestamps =*/ false,
|
|
3453
|
+
/*.thold_pt =*/ 0.01f,
|
|
3454
|
+
/*.thold_ptsum =*/ 0.01f,
|
|
3455
|
+
/*.max_len =*/ 0,
|
|
3456
|
+
/*.split_on_word =*/ false,
|
|
3457
|
+
/*.max_tokens =*/ 0,
|
|
3458
|
+
|
|
3459
|
+
/*.speed_up =*/ false,
|
|
3460
|
+
/*.audio_ctx =*/ 0,
|
|
3461
|
+
|
|
3462
|
+
/*.tdrz_enable =*/ false,
|
|
3463
|
+
|
|
3464
|
+
/*.initial_prompt =*/ nullptr,
|
|
3465
|
+
/*.prompt_tokens =*/ nullptr,
|
|
3466
|
+
/*.prompt_n_tokens =*/ 0,
|
|
3467
|
+
|
|
3468
|
+
/*.language =*/ "en",
|
|
3469
|
+
/*.detect_language =*/ false,
|
|
3470
|
+
|
|
3471
|
+
/*.suppress_blank =*/ true,
|
|
3337
3472
|
/*.suppress_non_speech_tokens =*/ false,
|
|
3338
3473
|
|
|
3339
|
-
/*.temperature
|
|
3340
|
-
/*.max_initial_ts
|
|
3341
|
-
/*.length_penalty
|
|
3474
|
+
/*.temperature =*/ 0.0f,
|
|
3475
|
+
/*.max_initial_ts =*/ 1.0f,
|
|
3476
|
+
/*.length_penalty =*/ -1.0f,
|
|
3342
3477
|
|
|
3343
|
-
/*.temperature_inc
|
|
3344
|
-
/*.entropy_thold
|
|
3345
|
-
/*.logprob_thold
|
|
3346
|
-
/*.no_speech_thold
|
|
3478
|
+
/*.temperature_inc =*/ 0.4f,
|
|
3479
|
+
/*.entropy_thold =*/ 2.4f,
|
|
3480
|
+
/*.logprob_thold =*/ -1.0f,
|
|
3481
|
+
/*.no_speech_thold =*/ 0.6f,
|
|
3347
3482
|
|
|
3348
|
-
/*.greedy
|
|
3483
|
+
/*.greedy =*/ {
|
|
3349
3484
|
/*.best_of =*/ -1,
|
|
3350
3485
|
},
|
|
3351
3486
|
|
|
@@ -3397,26 +3532,6 @@ static void whisper_exp_compute_token_level_timestamps(
|
|
|
3397
3532
|
float thold_pt,
|
|
3398
3533
|
float thold_ptsum);
|
|
3399
3534
|
|
|
3400
|
-
// trim from start (in place)
|
|
3401
|
-
static inline void ltrim(std::string &s) {
|
|
3402
|
-
s.erase(s.begin(), std::find_if_not(s.begin(), s.end(), [](unsigned char ch) {
|
|
3403
|
-
return std::isspace(ch);
|
|
3404
|
-
}));
|
|
3405
|
-
}
|
|
3406
|
-
|
|
3407
|
-
// trim from end (in place)
|
|
3408
|
-
static inline void rtrim(std::string &s) {
|
|
3409
|
-
s.erase(std::find_if_not(s.rbegin(), s.rend(), [](unsigned char ch) {
|
|
3410
|
-
return std::isspace(ch);
|
|
3411
|
-
}).base(), s.end());
|
|
3412
|
-
}
|
|
3413
|
-
|
|
3414
|
-
// trim from both ends (in place)
|
|
3415
|
-
static inline void trim(std::string &s) {
|
|
3416
|
-
rtrim(s);
|
|
3417
|
-
ltrim(s);
|
|
3418
|
-
}
|
|
3419
|
-
|
|
3420
3535
|
static inline bool should_split_on_word(const char * txt, bool split_on_word) {
|
|
3421
3536
|
if (!split_on_word) return true;
|
|
3422
3537
|
|
|
@@ -3443,14 +3558,10 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta
|
|
|
3443
3558
|
const int cur = strlen(txt);
|
|
3444
3559
|
|
|
3445
3560
|
if (acc + cur > max_len && i > 0 && should_split_on_word(txt, split_on_word)) {
|
|
3446
|
-
// split here
|
|
3447
|
-
if (split_on_word) {
|
|
3448
|
-
trim(text);
|
|
3449
|
-
}
|
|
3450
|
-
|
|
3451
3561
|
state.result_all.back().text = std::move(text);
|
|
3452
3562
|
state.result_all.back().t1 = token.t0;
|
|
3453
3563
|
state.result_all.back().tokens.resize(i);
|
|
3564
|
+
state.result_all.back().speaker_turn_next = false;
|
|
3454
3565
|
|
|
3455
3566
|
state.result_all.push_back({});
|
|
3456
3567
|
state.result_all.back().t0 = token.t0;
|
|
@@ -3462,6 +3573,8 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta
|
|
|
3462
3573
|
segment.tokens.begin() + i,
|
|
3463
3574
|
segment.tokens.end());
|
|
3464
3575
|
|
|
3576
|
+
state.result_all.back().speaker_turn_next = segment.speaker_turn_next;
|
|
3577
|
+
|
|
3465
3578
|
acc = 0;
|
|
3466
3579
|
text = "";
|
|
3467
3580
|
|
|
@@ -3475,9 +3588,6 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta
|
|
|
3475
3588
|
}
|
|
3476
3589
|
}
|
|
3477
3590
|
|
|
3478
|
-
if (split_on_word) {
|
|
3479
|
-
trim(text);
|
|
3480
|
-
}
|
|
3481
3591
|
state.result_all.back().text = std::move(text);
|
|
3482
3592
|
|
|
3483
3593
|
return res;
|
|
@@ -3543,9 +3653,14 @@ static void whisper_process_logits(
|
|
|
3543
3653
|
// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L410-L412
|
|
3544
3654
|
logits[vocab.token_not] = -INFINITY;
|
|
3545
3655
|
|
|
3546
|
-
// suppress sot and
|
|
3656
|
+
// suppress sot and nosp tokens
|
|
3547
3657
|
logits[vocab.token_sot] = -INFINITY;
|
|
3548
|
-
logits[vocab.
|
|
3658
|
+
logits[vocab.token_nosp] = -INFINITY; // TODO: ignore this token for now
|
|
3659
|
+
|
|
3660
|
+
// [TDRZ] when tinydiarize is disabled, suppress solm token
|
|
3661
|
+
if (params.tdrz_enable == false) {
|
|
3662
|
+
logits[vocab.token_solm] = -INFINITY;
|
|
3663
|
+
}
|
|
3549
3664
|
|
|
3550
3665
|
// suppress task tokens
|
|
3551
3666
|
logits[vocab.token_translate] = -INFINITY;
|
|
@@ -4042,9 +4157,9 @@ int whisper_full_with_state(
|
|
|
4042
4157
|
state->lang_id = lang_id;
|
|
4043
4158
|
prompt_init.push_back(whisper_token_lang(ctx, lang_id));
|
|
4044
4159
|
if (params.translate) {
|
|
4045
|
-
prompt_init.push_back(whisper_token_translate());
|
|
4160
|
+
prompt_init.push_back(whisper_token_translate(ctx));
|
|
4046
4161
|
} else {
|
|
4047
|
-
prompt_init.push_back(whisper_token_transcribe());
|
|
4162
|
+
prompt_init.push_back(whisper_token_transcribe(ctx));
|
|
4048
4163
|
}
|
|
4049
4164
|
}
|
|
4050
4165
|
|
|
@@ -4524,23 +4639,27 @@ int whisper_full_with_state(
|
|
|
4524
4639
|
prompt_past.push_back(tokens_cur[i].id);
|
|
4525
4640
|
}
|
|
4526
4641
|
|
|
4527
|
-
// store the text from this iteration
|
|
4528
4642
|
if (!tokens_cur.empty() && ctx->model.n_loaded > 0) {
|
|
4529
4643
|
int i0 = 0;
|
|
4530
4644
|
auto t0 = seek + 2*(tokens_cur.front().tid - whisper_token_beg(ctx));
|
|
4531
4645
|
|
|
4532
4646
|
std::string text;
|
|
4647
|
+
bool speaker_turn_next = false;
|
|
4533
4648
|
|
|
4534
4649
|
for (int i = 0; i < (int) tokens_cur.size(); i++) {
|
|
4535
4650
|
//printf("%s: %18s %6.3f %18s %6.3f\n", __func__,
|
|
4536
4651
|
// ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p,
|
|
4537
4652
|
// ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt);
|
|
4538
4653
|
|
|
4539
|
-
if (params.print_special
|
|
4540
|
-
} else {
|
|
4654
|
+
if (params.print_special || tokens_cur[i].id < whisper_token_eot(ctx)) {
|
|
4541
4655
|
text += whisper_token_to_str(ctx, tokens_cur[i].id);
|
|
4542
4656
|
}
|
|
4543
4657
|
|
|
4658
|
+
// [TDRZ] record if speaker turn was predicted after current segment
|
|
4659
|
+
if (params.tdrz_enable && tokens_cur[i].id == whisper_token_solm(ctx)) {
|
|
4660
|
+
speaker_turn_next = true;
|
|
4661
|
+
}
|
|
4662
|
+
|
|
4544
4663
|
if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
|
|
4545
4664
|
const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
|
|
4546
4665
|
|
|
@@ -4559,7 +4678,7 @@ int whisper_full_with_state(
|
|
|
4559
4678
|
|
|
4560
4679
|
//printf("tt0 = %d, tt1 = %d, text = %s, token = %s, token_id = %d, tid = %d\n", tt0, tt1, text.c_str(), ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].id, tokens_cur[i].tid);
|
|
4561
4680
|
|
|
4562
|
-
result_all.push_back({ tt0, tt1, text, {} });
|
|
4681
|
+
result_all.push_back({ tt0, tt1, text, {}, speaker_turn_next });
|
|
4563
4682
|
for (int j = i0; j <= i; j++) {
|
|
4564
4683
|
result_all.back().tokens.push_back(tokens_cur[j]);
|
|
4565
4684
|
}
|
|
@@ -4585,6 +4704,7 @@ int whisper_full_with_state(
|
|
|
4585
4704
|
i--;
|
|
4586
4705
|
t0 = t1;
|
|
4587
4706
|
i0 = i + 1;
|
|
4707
|
+
speaker_turn_next = false;
|
|
4588
4708
|
}
|
|
4589
4709
|
}
|
|
4590
4710
|
|
|
@@ -4603,7 +4723,7 @@ int whisper_full_with_state(
|
|
|
4603
4723
|
}
|
|
4604
4724
|
}
|
|
4605
4725
|
|
|
4606
|
-
result_all.push_back({ tt0, tt1, text, {} });
|
|
4726
|
+
result_all.push_back({ tt0, tt1, text, {} , speaker_turn_next });
|
|
4607
4727
|
for (int j = i0; j < (int) tokens_cur.size(); j++) {
|
|
4608
4728
|
result_all.back().tokens.push_back(tokens_cur[j]);
|
|
4609
4729
|
}
|
|
@@ -4783,6 +4903,10 @@ int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment)
|
|
|
4783
4903
|
return ctx->state->result_all[i_segment].t1;
|
|
4784
4904
|
}
|
|
4785
4905
|
|
|
4906
|
+
bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment) {
|
|
4907
|
+
return ctx->state->result_all[i_segment].speaker_turn_next;
|
|
4908
|
+
}
|
|
4909
|
+
|
|
4786
4910
|
const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment) {
|
|
4787
4911
|
return state->result_all[i_segment].text.c_str();
|
|
4788
4912
|
}
|
package/cpp/whisper.h
CHANGED
|
@@ -110,6 +110,23 @@ extern "C" {
|
|
|
110
110
|
|
|
111
111
|
WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
|
|
112
112
|
|
|
113
|
+
// Given a context, enable use of OpenVINO for encode inference.
|
|
114
|
+
// model_path: Optional path to OpenVINO encoder IR model. If set to nullptr,
|
|
115
|
+
// the path will be generated from the ggml model path that was passed
|
|
116
|
+
// in to whisper_init_from_file. For example, if 'path_model' was
|
|
117
|
+
// "/path/to/ggml-base.en.bin", then OpenVINO IR model path will be
|
|
118
|
+
// assumed to be "/path/to/ggml-base.en-encoder-openvino.xml".
|
|
119
|
+
// device: OpenVINO device to run inference on ("CPU", "GPU", etc.)
|
|
120
|
+
// cache_dir: Optional cache directory that can speed up init time, especially for
|
|
121
|
+
// GPU, by caching compiled 'blobs' there.
|
|
122
|
+
// Set to nullptr if not used.
|
|
123
|
+
// Returns 0 on success. If OpenVINO is not enabled in build, this simply returns 1.
|
|
124
|
+
WHISPER_API int whisper_ctx_init_openvino_encoder(
|
|
125
|
+
struct whisper_context * ctx,
|
|
126
|
+
const char * model_path,
|
|
127
|
+
const char * device,
|
|
128
|
+
const char * cache_dir);
|
|
129
|
+
|
|
113
130
|
// Frees all allocated memory
|
|
114
131
|
WHISPER_API void whisper_free (struct whisper_context * ctx);
|
|
115
132
|
WHISPER_API void whisper_free_state(struct whisper_state * state);
|
|
@@ -277,15 +294,16 @@ extern "C" {
|
|
|
277
294
|
// Special tokens
|
|
278
295
|
WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
|
|
279
296
|
WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
|
|
280
|
-
WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
|
|
281
297
|
WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
|
|
298
|
+
WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
|
|
299
|
+
WHISPER_API whisper_token whisper_token_nosp(struct whisper_context * ctx);
|
|
282
300
|
WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
|
|
283
301
|
WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
|
|
284
302
|
WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);
|
|
285
303
|
|
|
286
304
|
// Task tokens
|
|
287
|
-
WHISPER_API whisper_token whisper_token_translate (
|
|
288
|
-
WHISPER_API whisper_token whisper_token_transcribe(
|
|
305
|
+
WHISPER_API whisper_token whisper_token_translate (struct whisper_context * ctx);
|
|
306
|
+
WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx);
|
|
289
307
|
|
|
290
308
|
// Performance information from the default state.
|
|
291
309
|
WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
|
|
@@ -358,6 +376,9 @@ extern "C" {
|
|
|
358
376
|
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
|
|
359
377
|
int audio_ctx; // overwrite the audio context size (0 = use default)
|
|
360
378
|
|
|
379
|
+
// [EXPERIMENTAL] [TDRZ] tinydiarize
|
|
380
|
+
bool tdrz_enable; // enable tinydiarize speaker turn detection
|
|
381
|
+
|
|
361
382
|
// tokens to provide to the whisper decoder as initial prompt
|
|
362
383
|
// these are prepended to any existing text context from a previous call
|
|
363
384
|
const char * initial_prompt;
|
|
@@ -460,6 +481,9 @@ extern "C" {
|
|
|
460
481
|
WHISPER_API int64_t whisper_full_get_segment_t1 (struct whisper_context * ctx, int i_segment);
|
|
461
482
|
WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
|
|
462
483
|
|
|
484
|
+
// Get whether the next segment is predicted as a speaker turn
|
|
485
|
+
WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment);
|
|
486
|
+
|
|
463
487
|
// Get the text of the specified segment
|
|
464
488
|
WHISPER_API const char * whisper_full_get_segment_text (struct whisper_context * ctx, int i_segment);
|
|
465
489
|
WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
|
|
@@ -488,9 +512,9 @@ extern "C" {
|
|
|
488
512
|
|
|
489
513
|
// Temporary helpers needed for exposing ggml interface
|
|
490
514
|
|
|
491
|
-
WHISPER_API int
|
|
492
|
-
WHISPER_API const char * whisper_bench_memcpy_str(int n_threads);
|
|
493
|
-
WHISPER_API int
|
|
515
|
+
WHISPER_API int whisper_bench_memcpy (int n_threads);
|
|
516
|
+
WHISPER_API const char * whisper_bench_memcpy_str (int n_threads);
|
|
517
|
+
WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads);
|
|
494
518
|
WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
|
|
495
519
|
|
|
496
520
|
#ifdef __cplusplus
|