whisper.rn 0.3.4 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -2
- package/android/build.gradle +12 -2
- package/android/src/main/CMakeLists.txt +55 -0
- package/android/src/main/java/com/rnwhisper/RNWhisper.java +328 -0
- package/android/src/main/java/com/rnwhisper/WhisperContext.java +13 -2
- package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +10 -228
- package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +16 -222
- package/cpp/README.md +4 -0
- package/cpp/coreml/whisper-encoder.mm +4 -2
- package/cpp/ggml.c +9 -1
- package/cpp/ggml.h +1 -0
- package/cpp/whisper.cpp +151 -99
- package/cpp/whisper.h +2 -1
- package/ios/RNWhisper.mm +36 -30
- package/ios/RNWhisperContext.h +8 -3
- package/ios/RNWhisperContext.mm +26 -13
- package/lib/commonjs/index.js +2 -2
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/index.js +2 -2
- package/lib/module/index.js.map +1 -1
- package/package.json +1 -1
- package/src/index.ts +2 -2
- package/android/src/main/jni/whisper/Android.mk +0 -26
- package/android/src/main/jni/whisper/Application.mk +0 -1
- package/android/src/main/jni/whisper/Whisper.mk +0 -22
- /package/android/src/main/{jni/whisper/jni.cpp → jni.cpp} +0 -0
package/cpp/whisper.cpp
CHANGED
@@ -82,7 +82,7 @@ static void byteswap_tensor(wsp_ggml_tensor * tensor) {
     } while (0)
 #define BYTESWAP_TENSOR(t) \
     do { \
-        byteswap_tensor(
+        byteswap_tensor(t); \
     } while (0)
 #else
 #define BYTESWAP_VALUE(d) do {} while (0)
@@ -589,7 +589,7 @@ struct whisper_model {
 struct whisper_sequence {
     std::vector<whisper_token_data> tokens;

-    // the accumulated transcription in the current
+    // the accumulated transcription in the current iteration (used to truncate the tokens array)
     int result_len;

     double sum_logprobs_all; // the sum of the log probabilities of the tokens
@@ -2347,6 +2347,23 @@ static std::string to_timestamp(int64_t t, bool comma = false) {
     return std::string(buf);
 }

+#define SIN_COS_N_COUNT WHISPER_N_FFT
+static float sin_vals[SIN_COS_N_COUNT];
+static float cos_vals[SIN_COS_N_COUNT];
+
+// In FFT, we frequently use sine and cosine operations with the same values.
+// We can use precalculated values to speed up the process.
+static void fill_sin_cos_table() {
+    static bool is_filled = false;
+    if (is_filled) return;
+    for (int i = 0; i < SIN_COS_N_COUNT; i++) {
+        double theta = (2*M_PI*i)/SIN_COS_N_COUNT;
+        sin_vals[i] = sinf(theta);
+        cos_vals[i] = cosf(theta);
+    }
+    is_filled = true;
+}
+
 // naive Discrete Fourier Transform
 // input is real-valued
 // output is complex-valued
@@ -2354,15 +2371,16 @@ static void dft(const std::vector<float> & in, std::vector<float> & out) {
    int N = in.size();

    out.resize(N*2);
+    const int sin_cos_step = SIN_COS_N_COUNT / N;

    for (int k = 0; k < N; k++) {
        float re = 0;
        float im = 0;

        for (int n = 0; n < N; n++) {
-
-            re += in[n]*cos(
-            im -= in[n]*sin(
+            int idx = (k * n * sin_cos_step) % (SIN_COS_N_COUNT); // t = 2*M_PI*k*n/N
+            re += in[n]*cos_vals[idx]; // cos(t)
+            im -= in[n]*sin_vals[idx]; // sin(t)
        }

        out[k*2 + 0] = re;
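The hunks above replace per-iteration `sin`/`cos` calls in `dft()` with lookups into tables of `SIN_COS_N_COUNT` (= `WHISPER_N_FFT`) precomputed values. Below is a minimal standalone sketch of the same indexing trick, not code from the package: `TABLE_N`, `fill_table` and `dft_table` are hypothetical names, and the modular index only equals the true angle when `N` divides the table size, which is what the radix-2 recursion over `WHISPER_N_FFT`-sample frames guarantees.

```cpp
// Sketch (not package code): precomputed sin/cos table for a naive DFT.
// Assumes N divides TABLE_N, mirroring fill_sin_cos_table()/dft() above.
#include <cmath>
#include <cstdio>
#include <vector>

constexpr int TABLE_N = 400; // stands in for SIN_COS_N_COUNT (== WHISPER_N_FFT)
static float sin_tab[TABLE_N];
static float cos_tab[TABLE_N];

static void fill_table() {
    for (int i = 0; i < TABLE_N; i++) {
        const double theta = 2.0*M_PI*i/TABLE_N;
        sin_tab[i] = (float) std::sin(theta);
        cos_tab[i] = (float) std::cos(theta);
    }
}

// naive DFT: the angle 2*pi*k*n/N maps to table index (k*n*step) % TABLE_N
static void dft_table(const std::vector<float> & in, std::vector<float> & out) {
    const int N = (int) in.size();
    const int step = TABLE_N / N; // assumes N divides TABLE_N
    out.assign(2*N, 0.0f);
    for (int k = 0; k < N; k++) {
        double re = 0.0, im = 0.0;
        for (int n = 0; n < N; n++) {
            const int idx = (k*n*step) % TABLE_N;
            re += in[n]*cos_tab[idx];
            im -= in[n]*sin_tab[idx];
        }
        out[2*k + 0] = (float) re;
        out[2*k + 1] = (float) im;
    }
}

int main() {
    fill_table();
    const std::vector<float> x = {0.5f, -1.0f, 0.25f, 2.0f};
    std::vector<float> X;
    dft_table(x, X); // N = 4 divides TABLE_N = 400
    for (int k = 0; k < 4; k++) {
        // compare against the direct sin/cos formula
        double re = 0.0, im = 0.0;
        for (int n = 0; n < 4; n++) {
            re += x[n]*std::cos(2.0*M_PI*k*n/4.0);
            im -= x[n]*std::sin(2.0*M_PI*k*n/4.0);
        }
        std::printf("k=%d table=(%.4f,%.4f) direct=(%.4f,%.4f)\n", k, X[2*k], X[2*k + 1], re, im);
    }
    return 0;
}
```

For example, with k = 3, n = 3, N = 4 the index is (3*3*100) % 400 = 100, i.e. the same π/2 angle the direct formula produces, so the table lookup and the trig call agree.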
@@ -2410,11 +2428,11 @@ static void fft(const std::vector<float> & in, std::vector<float> & out) {
    fft(even, even_fft);
    fft(odd, odd_fft);

+    const int sin_cos_step = SIN_COS_N_COUNT / N;
    for (int k = 0; k < N/2; k++) {
-
-
-        float
-        float im = -sin(theta);
+        int idx = k * sin_cos_step; // t = 2*M_PI*k/N
+        float re = cos_vals[idx]; // cos(t)
+        float im = -sin_vals[idx]; // sin(t)

        float re_odd = odd_fft[2*k + 0];
        float im_odd = odd_fft[2*k + 1];
@@ -2427,40 +2445,50 @@ static void fft(const std::vector<float> & in, std::vector<float> & out) {
    }
 }

-static
-
-
-
-
-
+static bool hann_window(int length, bool periodic, std::vector<float> & output) {
+    if (output.size() < length) {
+        output.resize(length);
+    }
+    int offset = -1;
+    if (periodic) {
+        offset = 0;
+    }
+    for (int i = 0; i < length; i++) {
+        output[i] = 0.5*(1.0 - cosf((2.0*M_PI*i)/(length + offset)));
+    }

-
-
+    return true;
+}

-
-
-
-
-
-
-
-
+static void log_mel_spectrogram_worker_thread(int ith, const std::vector<float> & hann, const std::vector<float> & samples,
+                                              int n_samples, int frame_size, int frame_step, int n_threads,
+                                              const whisper_filters & filters, whisper_mel & mel) {
+    std::vector<float> fft_in(frame_size, 0.0);
+    std::vector<float> fft_out(2 * frame_step);
+    // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist
+    int n_fft = 1 + (frame_size / 2);
+    int i = ith;

-
-
+    // calculate FFT only when fft_in are not all zero
+    for (; i < std::min(n_samples / frame_step + 1, mel.n_len); i += n_threads) {
+        const int offset = i * frame_step;

-
-
+        // apply Hanning window (~10% faster)
+        for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) {
+            fft_in[j] = hann[j] * samples[offset + j];
        }
-
-
+        // fill the rest with zeros
+        if (n_samples - offset < frame_size) {
+            std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0);
        }

-
-
-
-
-
+        // FFT
+        fft(fft_in, fft_out);
+
+        // Calculate modulus^2 of complex numbers
+        // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting.
+        for (int j = 0; j < frame_size; j++) {
+            fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]);
        }

        // mel spectrogram
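The new `hann_window(length, periodic, output)` helper above generalizes the old inline window loop: `periodic = true` (offset 0) divides by `length`, which is what `torch.hann_window` produces by default and what the framing here expects, while `periodic = false` (offset −1) gives the symmetric window that ends at zero. A short sketch of the distinction, with my own helper name rather than the package's:

```cpp
// Sketch (not package code): periodic vs. symmetric Hann window.
#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<float> make_hann(int length, bool periodic) {
    const int offset = periodic ? 0 : -1; // periodic divides by length, symmetric by length - 1
    std::vector<float> w(length);
    for (int i = 0; i < length; i++) {
        w[i] = 0.5f*(1.0f - cosf((2.0f*(float)M_PI*i)/(length + offset)));
    }
    return w;
}

int main() {
    const int n = 8;
    const auto p = make_hann(n, true);   // periodic: period is exactly n, w[n-1] > 0
    const auto s = make_hann(n, false);  // symmetric: w[0] == w[n-1] == 0
    for (int i = 0; i < n; i++) {
        std::printf("i=%d periodic=%.4f symmetric=%.4f\n", i, p[i], s[i]);
    }
    return 0;
}
```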
@@ -2471,10 +2499,10 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector<float>
            int k = 0;
            for (k = 0; k < n_fft - 3; k += 4) {
                sum +=
-
-
-
-
+                    fft_out[k + 0] * filters.data[j * n_fft + k + 0] +
+                    fft_out[k + 1] * filters.data[j * n_fft + k + 1] +
+                    fft_out[k + 2] * filters.data[j * n_fft + k + 2] +
+                    fft_out[k + 3] * filters.data[j * n_fft + k + 3];
            }

            // handle n_fft remainder
@@ -2487,68 +2515,73 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector<float>
            mel.data[j * mel.n_len + i] = sum;
        }
    }
+
+    // Otherwise fft_out are all zero
+    double sum = log10(1e-10);
+    for (; i < mel.n_len; i += n_threads) {
+        for (int j = 0; j < mel.n_mel; j++) {
+            mel.data[j * mel.n_len + i] = sum;
+        }
+    }
 }

-// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#
+// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157
 static bool log_mel_spectrogram(
-
-
+              whisper_state & wstate,
+              const float * samples,
              const int n_samples,
              const int /*sample_rate*/,
-              const int
-              const int
+              const int frame_size,
+              const int frame_step,
              const int n_mel,
              const int n_threads,
-
-
-
+              const whisper_filters & filters,
+              const bool debug,
+              whisper_mel & mel) {
    const int64_t t_start_us = wsp_ggml_time_us();

-    // Hanning window
+    // Hanning window (Use cosf to eliminate difference)
+    // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html
+    // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147
    std::vector<float> hann;
-    hann
-    for (int i = 0; i < fft_size; i++) {
-        hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size)));
-    }
+    hann_window(frame_size, true, hann);

-    mel.n_mel = n_mel;
-    mel.n_len = n_samples/fft_step;
-    mel.n_len_org = mel.n_len;

-
+    // Calculate the length of padding
+    int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
+    int64_t stage_2_pad = frame_size / 2;

-    //
-
-
+    // Initialize a vector and copy data from C array to it.
+    std::vector<float> samples_padded;
+    samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2);
+    std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad);

-
-
-    }
-    mel.n_len += pad;
+    // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio
+    std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0);

-
-
-    memset(samples_padded.data() + n_samples, 0, (mel.n_len*fft_step - n_samples)*sizeof(float));
+    // reflective pad 200 samples at the beginning of audio
+    std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin());

-
-
-
-    mel.
+    mel.n_mel = n_mel;
+    // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936
+    // Calculate number of frames + remove the last frame
+    mel.n_len = (samples_padded.size() - frame_size) / frame_step;
+    // Calculate semi-padded sample length to ensure compatibility
+    mel.n_len_org = 1 + (n_samples + stage_2_pad - frame_size) / frame_step;
+    mel.data.resize(mel.n_mel * mel.n_len);

-    //printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len);
-    //printf("%s: recording length: %f s\n", __func__, (float) n_samples/sample_rate);

    {
        std::vector<std::thread> workers(n_threads - 1);
        for (int iw = 0; iw < n_threads - 1; ++iw) {
            workers[iw] = std::thread(
-                    log_mel_spectrogram_worker_thread, iw + 1, std::cref(hann),
-                    n_samples,
-                    std::cref(filters),
+                    log_mel_spectrogram_worker_thread, iw + 1, std::cref(hann), samples_padded,
+                    n_samples + stage_2_pad, frame_size, frame_step, n_threads,
+                    std::cref(filters), std::ref(mel));
        }

        // main thread
-        log_mel_spectrogram_worker_thread(0, hann,
+        log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, filters, mel);

        for (int iw = 0; iw < n_threads - 1; ++iw) {
            workers[iw].join();
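The rewritten `log_mel_spectrogram()` above builds a padded buffer laid out as reflect(frame_size/2) + samples + zeros(30 s + frame_size/2), mirroring OpenAI's reference pipeline. A toy sketch of that layout, not package code, with tiny stand-in sizes so the result is easy to print:

```cpp
// Sketch (not package code) of the padding layout:
// [ reflect(frame_size/2) | original samples | zeros(30 s + frame_size/2) ].
// In whisper.cpp the reflective pad is frame_size/2 = 200 samples and the
// zero pad is WHISPER_SAMPLE_RATE * 30 samples; tiny values are used here.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<float> samples = {1, 2, 3, 4, 5, 6};
    const int stage_2_pad = 2; // stands in for frame_size / 2
    const int stage_1_pad = 4; // stands in for WHISPER_SAMPLE_RATE * 30

    std::vector<float> padded(samples.size() + stage_1_pad + stage_2_pad*2);

    // copy the signal after the leading reflective pad
    std::copy(samples.begin(), samples.end(), padded.begin() + stage_2_pad);
    // zeros at the tail (30 s stand-in plus the trailing half-frame)
    std::fill(padded.begin() + samples.size() + stage_2_pad, padded.end(), 0.0f);
    // reflective pad at the head: samples[1..stage_2_pad] reversed
    std::reverse_copy(samples.begin() + 1, samples.begin() + 1 + stage_2_pad, padded.begin());

    for (float v : padded) std::printf("%g ", v); // prints: 3 2 1 2 3 4 5 6 0 0 0 0 0 0
    std::printf("\n");
    return 0;
}
```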
@@ -2562,7 +2595,6 @@ static bool log_mel_spectrogram(
            mmax = mel.data[i];
        }
    }
-    //printf("%s: max = %f\n", __func__, mmax);

    mmax -= 8.0;

@@ -2576,7 +2608,16 @@ static bool log_mel_spectrogram(

    wstate.t_mel_us += wsp_ggml_time_us() - t_start_us;

-    //
+    // Dump log_mel_spectrogram
+    if (debug) {
+        std::ofstream outFile("log_mel_spectrogram.json");
+        outFile << "[";
+        for (uint64_t i = 0; i < mel.data.size() - 1; i++) {
+            outFile << mel.data[i] << ", ";
+        }
+        outFile << mel.data[mel.data.size() - 1] << "]";
+        outFile.close();
+    }

    return true;
 }
@@ -2694,6 +2735,7 @@ static std::string whisper_openvino_get_path_cache(std::string path_bin) {
 #endif

 struct whisper_state * whisper_init_state(whisper_context * ctx) {
+    fill_sin_cos_table();
    whisper_state * state = new whisper_state;

    const size_t scale = ctx->model.hparams.ftype ? 1 : 2;
@@ -3007,9 +3049,9 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int
    return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads);
 }

-// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2
+// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
 int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters,
+    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
        log("%s: failed to compute mel spectrogram\n", __func__);
        return -1;
    }
@@ -3017,11 +3059,20 @@ int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, st
    return 0;
 }

-// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2
+// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
 int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
    return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads);
 }

+// same as whisper_pcm_to_mel, but applies WSOLA to speed up the audio x2
+// TODO
+
+// same as whisper_pcm_to_mel, but applies HPTSM to speed up the audio x2
+// TODO
+
+// same as whisper_pcm_to_mel, but applies PV (with phase lock) to speed up the audio x2
+// TODO
+
 int whisper_set_mel_with_state(
        struct whisper_context * /*ctx*/,
        struct whisper_state * state,
@@ -3089,7 +3140,6 @@ int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, i
        return false;
    }

-
    if (!whisper_decode_internal(*ctx, *ctx->state, ctx->state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
        log("%s: failed to eval\n", __func__);
        return 1;
@@ -3323,7 +3373,6 @@ float * whisper_get_logits(struct whisper_context * ctx) {
    return ctx->state->logits.data();
 }

-
 float * whisper_get_logits_from_state(struct whisper_state * state) {
    return state->logits.data();
 }
@@ -3431,6 +3480,7 @@ const char * whisper_print_system_info(void) {
    s += "WASM_SIMD = " + std::to_string(wsp_ggml_cpu_has_wasm_simd()) + " | ";
    s += "BLAS = " + std::to_string(wsp_ggml_cpu_has_blas()) + " | ";
    s += "SSE3 = " + std::to_string(wsp_ggml_cpu_has_sse3()) + " | ";
+    s += "SSSE3 = " + std::to_string(wsp_ggml_cpu_has_ssse3()) + " | ";
    s += "VSX = " + std::to_string(wsp_ggml_cpu_has_vsx()) + " | ";
    s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
    s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
@@ -3473,6 +3523,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
        /*.max_tokens =*/ 0,

        /*.speed_up =*/ false,
+        /*.debug_mode =*/ false,
        /*.audio_ctx =*/ 0,

        /*.tdrz_enable =*/ false,
@@ -3634,7 +3685,7 @@ static void whisper_process_logits(
    WHISPER_ASSERT(n_logits == ctx.vocab.n_vocab);

    // extract the logits for the last token
-    // we will be mutating and therefore we don't want to use the ctx.logits buffer directly
+    // we will be mutating, and therefore we don't want to use the ctx.logits buffer directly
    auto & probs = decoder.probs;
    auto & logits = decoder.logits;
    auto & logprobs = decoder.logprobs;
@@ -4035,16 +4086,17 @@ int whisper_full_with_state(

    result_all.clear();

-
-
-    if (
+    if (n_samples > 0) {
+        // compute log mel spectrogram
+        if (params.speed_up) {
+            // TODO: Replace PV with more advanced algorithm
            log("%s: failed to compute log mel spectrogram\n", __func__);
            return -1;
-    }
-
-
-
-
+        } else {
+            if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
+                log("%s: failed to compute log mel spectrogram\n", __func__);
+                return -2;
+            }
        }
    }

@@ -4070,14 +4122,16 @@ int whisper_full_with_state(
        state->t_beg = 0;
        state->t_last = 0;
        state->tid_last = 0;
-
+        if (n_samples > 0) {
+            state->energy = get_signal_energy(samples, n_samples, 32);
+        }
    }

    const int seek_start = params.offset_ms/10;
    const int seek_end = params.duration_ms == 0 ? whisper_n_len_from_state(state) : seek_start + params.duration_ms/10;

-    // if length of spectrogram is less than
-    // basically don't process anything that is less than
+    // if length of spectrogram is less than 1.0s (100 frames), then return
+    // basically don't process anything that is less than 1.0s
    // see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
    if (seek_end < seek_start + (params.speed_up ? 50 : 100)) {
        return 0;
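The `n_samples > 0` guards added above mean `whisper_full_with_state()` no longer insists on PCM input: when the mel spectrogram is already stored in the state, the spectrogram (and signal-energy) steps are simply skipped. A hedged usage sketch, assuming the stock `whisper_init_from_file` / `whisper_pcm_to_mel` / `whisper_full` entry points from whisper.h; the helper name and placeholder arguments are mine:

```cpp
// Hedged sketch (not package code): precompute the mel spectrogram, then run
// whisper_full() with no PCM so the guards above skip recomputation.
#include <vector>
#include "whisper.h"

int run_with_precomputed_mel(const char * model_path, const std::vector<float> & pcm) {
    struct whisper_context * ctx = whisper_init_from_file(model_path);
    if (ctx == nullptr) return 1;

    // 1) fill the state's mel from PCM (whisper_set_mel could inject custom data instead)
    if (whisper_pcm_to_mel(ctx, pcm.data(), (int) pcm.size(), /*n_threads*/ 4) != 0) {
        whisper_free(ctx);
        return 2;
    }

    // 2) run the full pipeline without passing samples again:
    //    n_samples == 0 skips the spectrogram and signal-energy computation
    whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    const int ret = whisper_full(ctx, params, nullptr, 0);

    whisper_free(ctx);
    return ret;
}
```

This is only a sketch of how the new guards can be exercised; the React Native bindings in this package drive `whisper_full` through their own wrappers.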
@@ -4207,7 +4261,7 @@ int whisper_full_with_state(
    while (true) {
        if (params.progress_callback) {
            const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
-
+
            params.progress_callback(
                ctx, ctx->state, progress_cur, params.progress_callback_user_data);
        }
@@ -4762,7 +4816,6 @@ int whisper_full_with_state(
    return 0;
 }

-
 int whisper_full(
        struct whisper_context * ctx,
        struct whisper_full_params params,
@@ -4839,7 +4892,6 @@ int whisper_full_parallel(
        result.t0 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
        result.t1 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;

-
        // make sure that segments are not overlapping
        if (!ctx->state->result_all.empty()) {
            result.t0 = std::max(result.t0, ctx->state->result_all.back().t1);
package/cpp/whisper.h
CHANGED
@@ -346,7 +346,7 @@ extern "C" {
        void * user_data);

    // Parameters for the whisper_full() function
-    // If you
+    // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
    // whisper_full_default_params()
    struct whisper_full_params {
        enum whisper_sampling_strategy strategy;
@@ -375,6 +375,7 @@ extern "C" {
        // [EXPERIMENTAL] speed-up techniques
        // note: these can significantly reduce the quality of the output
        bool speed_up; // speed-up the audio by 2x using Phase Vocoder
+        bool debug_mode; // enable debug_mode provides extra info (eg. Dump log_mel)
        int audio_ctx; // overwrite the audio context size (0 = use default)

        // [EXPERIMENTAL] [TDRZ] tinydiarize
package/ios/RNWhisper.mm
CHANGED
@@ -68,13 +68,17 @@ RCT_REMAP_METHOD(initContext,
        path = [[NSBundle mainBundle] pathForResource:modelPath ofType:nil];
    }

-
+    int contextId = arc4random_uniform(1000000);
+
+    RNWhisperContext *context = [RNWhisperContext
+        initWithModelPath:path
+        contextId:contextId
+    ];
    if ([context getContext] == NULL) {
        reject(@"whisper_cpp_error", @"Failed to load the model", nil);
        return;
    }

-    int contextId = arc4random_uniform(1000000);
    [contexts setObject:context forKey:[NSNumber numberWithInt:contextId]];

    resolve([NSNumber numberWithInt:contextId]);
@@ -122,36 +126,36 @@ RCT_REMAP_METHOD(transcribeFile,
        reject(@"whisper_error", @"Invalid file", nil);
        return;
    }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    [context transcribeFile:jobId
+        audioData:waveFile
+        audioDataCount:count
+        options:options
+        onProgress: ^(int progress) {
+            if (rn_whisper_transcribe_is_aborted(jobId)) {
+                return;
+            }
+            dispatch_async(dispatch_get_main_queue(), ^{
+                [self sendEventWithName:@"@RNWhisper_onTranscribeProgress"
+                    body:@{
+                        @"contextId": [NSNumber numberWithInt:contextId],
+                        @"jobId": [NSNumber numberWithInt:jobId],
+                        @"progress": [NSNumber numberWithInt:progress]
+                    }
+                ];
+            });
+        }
+        onEnd: ^(int code) {
+            if (code != 0) {
+                free(waveFile);
+                reject(@"whisper_cpp_error", [NSString stringWithFormat:@"Failed to transcribe the file. Code: %d", code], nil);
+                return;
            }
-    ];
-    if (code != 0) {
            free(waveFile);
-
-
+            NSMutableDictionary *result = [context getTextSegments];
+            result[@"isAborted"] = @([context isStoppedByAction]);
+            resolve(result);
        }
-
-    NSMutableDictionary *result = [context getTextSegments];
-    result[@"isAborted"] = @([context isStoppedByAction]);
-    resolve(result);
-    });
+    ];
 }

 RCT_REMAP_METHOD(startRealtimeTranscribe,
@@ -260,7 +264,7 @@ RCT_REMAP_METHOD(releaseAllContexts,
 }

 - (void)invalidate {
-
+    [super invalidate];

    if (contexts == nil) {
        return;
@@ -271,6 +275,8 @@ RCT_REMAP_METHOD(releaseAllContexts,
        [context invalidate];
    }

+    rn_whisper_abort_all_transcribe(); // graceful abort
+
    [contexts removeAllObjects];
    contexts = nil;

package/ios/RNWhisperContext.h
CHANGED
@@ -36,21 +36,26 @@ typedef struct {
 } RNWhisperContextRecordState;

 @interface RNWhisperContext : NSObject {
+    int contextId;
+    dispatch_queue_t dQueue;
    struct whisper_context * ctx;
    RNWhisperContextRecordState recordState;
 }

-+ (instancetype)initWithModelPath:(NSString *)modelPath;
++ (instancetype)initWithModelPath:(NSString *)modelPath contextId:(int)contextId;
 - (struct whisper_context *)getContext;
+- (dispatch_queue_t)getDispatchQueue;
 - (OSStatus)transcribeRealtime:(int)jobId
    options:(NSDictionary *)options
    onTranscribe:(void (^)(int, NSString *, NSDictionary *))onTranscribe;
-- (
+- (void)transcribeFile:(int)jobId
    audioData:(float *)audioData
    audioDataCount:(int)audioDataCount
    options:(NSDictionary *)options
-    onProgress:(void (^)(int))onProgress
+    onProgress:(void (^)(int))onProgress
+    onEnd:(void (^)(int))onEnd;
 - (void)stopTranscribe:(int)jobId;
+- (void)stopCurrentTranscribe;
 - (bool)isCapturing;
 - (bool)isTranscribing;
 - (bool)isStoppedByAction;