whisper.rn 0.4.0-rc.3 → 0.4.0-rc.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/README.md +6 -6
  2. package/android/build.gradle +4 -0
  3. package/android/src/main/CMakeLists.txt +7 -0
  4. package/android/src/main/java/com/rnwhisper/AudioUtils.java +0 -80
  5. package/android/src/main/java/com/rnwhisper/RNWhisper.java +6 -1
  6. package/android/src/main/java/com/rnwhisper/WhisperContext.java +53 -135
  7. package/android/src/main/jni-utils.h +76 -0
  8. package/android/src/main/jni.cpp +188 -109
  9. package/cpp/README.md +1 -1
  10. package/cpp/coreml/whisper-encoder-impl.h +1 -1
  11. package/cpp/coreml/whisper-encoder.h +4 -0
  12. package/cpp/coreml/whisper-encoder.mm +4 -2
  13. package/cpp/ggml-alloc.c +451 -282
  14. package/cpp/ggml-alloc.h +74 -8
  15. package/cpp/ggml-backend-impl.h +112 -0
  16. package/cpp/ggml-backend.c +1357 -0
  17. package/cpp/ggml-backend.h +181 -0
  18. package/cpp/ggml-impl.h +243 -0
  19. package/cpp/{ggml-metal.metal → ggml-metal-whisper.metal} +1556 -329
  20. package/cpp/ggml-metal.h +28 -1
  21. package/cpp/ggml-metal.m +1128 -308
  22. package/cpp/ggml-quants.c +7382 -0
  23. package/cpp/ggml-quants.h +224 -0
  24. package/cpp/ggml.c +3848 -5245
  25. package/cpp/ggml.h +353 -155
  26. package/cpp/rn-audioutils.cpp +68 -0
  27. package/cpp/rn-audioutils.h +14 -0
  28. package/cpp/rn-whisper-log.h +11 -0
  29. package/cpp/rn-whisper.cpp +141 -59
  30. package/cpp/rn-whisper.h +47 -15
  31. package/cpp/whisper.cpp +1750 -964
  32. package/cpp/whisper.h +97 -15
  33. package/ios/RNWhisper.mm +15 -9
  34. package/ios/RNWhisper.xcodeproj/project.xcworkspace/contents.xcworkspacedata +4 -0
  35. package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
  36. package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcuserdata/jhen.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
  37. package/ios/RNWhisper.xcodeproj/xcuserdata/jhen.xcuserdatad/xcschemes/xcschememanagement.plist +19 -0
  38. package/ios/RNWhisperAudioUtils.h +0 -2
  39. package/ios/RNWhisperAudioUtils.m +0 -56
  40. package/ios/RNWhisperContext.h +8 -12
  41. package/ios/RNWhisperContext.mm +132 -138
  42. package/jest/mock.js +1 -1
  43. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  44. package/lib/commonjs/index.js +28 -9
  45. package/lib/commonjs/index.js.map +1 -1
  46. package/lib/commonjs/version.json +1 -1
  47. package/lib/module/NativeRNWhisper.js.map +1 -1
  48. package/lib/module/index.js +28 -9
  49. package/lib/module/index.js.map +1 -1
  50. package/lib/module/version.json +1 -1
  51. package/lib/typescript/NativeRNWhisper.d.ts +7 -1
  52. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  53. package/lib/typescript/index.d.ts +7 -2
  54. package/lib/typescript/index.d.ts.map +1 -1
  55. package/package.json +6 -5
  56. package/src/NativeRNWhisper.ts +8 -1
  57. package/src/index.ts +29 -17
  58. package/src/version.json +1 -1
  59. package/whisper-rn.podspec +1 -2
@@ -0,0 +1,68 @@
1
+ #include "rn-audioutils.h"
2
+ #include "rn-whisper-log.h"
3
+
4
+ namespace rnaudioutils {
5
+
6
/**
 * Serialize a list of 16-bit PCM buffers into one little-endian byte stream.
 *
 * @param buffers          Sample buffers, concatenated in the given order.
 *                         Null entries are skipped.
 * @param slice_n_samples  Number of valid samples in each buffer, matched to
 *                         `buffers` by index. Non-positive counts add nothing.
 * @return Two bytes per sample (low byte first), buffers back to back.
 *
 * Only index pairs present in BOTH vectors are processed, so a count vector
 * that is shorter than the buffer list can no longer be read out of bounds.
 */
std::vector<uint8_t> concat_short_buffers(const std::vector<short*>& buffers, const std::vector<int>& slice_n_samples) {
    const size_t n_buffers = std::min(buffers.size(), slice_n_samples.size());

    // First pass: total sample count, so the output is allocated exactly once.
    size_t total_samples = 0;
    for (size_t i = 0; i < n_buffers; i++) {
        if (buffers[i] != nullptr && slice_n_samples[i] > 0) {
            total_samples += static_cast<size_t>(slice_n_samples[i]);
        }
    }

    std::vector<uint8_t> output_data;
    output_data.reserve(total_samples * 2);

    for (size_t i = 0; i < n_buffers; i++) {
        const short* slice = buffers[i];
        const int size = slice_n_samples[i];
        if (slice == nullptr || size <= 0) {
            continue;
        }

        for (int j = 0; j < size; j++) {
            // Go through an unsigned type so the shift never acts on a negative value.
            const uint16_t sample = static_cast<uint16_t>(slice[j]);
            output_data.push_back(static_cast<uint8_t>(sample & 0xFF)); // Lower byte
            output_data.push_back(static_cast<uint8_t>(sample >> 8));   // Higher byte
        }
    }

    return output_data;
}
22
+
23
/**
 * Return a copy of `audio_data` without its trailing silence.
 *
 * The bytes are 16-bit little-endian samples (two bytes each), so the cut is
 * made on a sample boundary: when the last non-zero byte is the LOW byte of a
 * sample, that sample's zero high byte is kept. Trimming byte by byte would
 * split such a sample and leave an odd-length, misaligned stream.
 *
 * @param audio_data  Raw PCM bytes.
 * @return The prefix ending with the last sample that has a non-zero byte;
 *         empty when every byte is zero.
 */
std::vector<uint8_t> remove_trailing_zeros(const std::vector<uint8_t>& audio_data) {
    const auto last = std::find_if(audio_data.rbegin(), audio_data.rend(), [](uint8_t byte) { return byte != 0; });

    // last.base() points one past the last non-zero byte.
    size_t keep = static_cast<size_t>(last.base() - audio_data.begin());

    // Round up to a whole sample, unless the input itself ends mid-sample.
    if (keep % 2 != 0 && keep < audio_data.size()) {
        keep++;
    }

    return std::vector<uint8_t>(audio_data.begin(), audio_data.begin() + static_cast<std::ptrdiff_t>(keep));
}
27
+
28
+ void save_wav_file(const std::vector<uint8_t>& raw, const std::string& file) {
29
+ std::vector<uint8_t> data = remove_trailing_zeros(raw);
30
+
31
+ std::ofstream output(file, std::ios::binary);
32
+
33
+ if (!output.is_open()) {
34
+ RNWHISPER_LOG_ERROR("Failed to open file for writing: %s\n", file.c_str());
35
+ return;
36
+ }
37
+
38
+ // WAVE header
39
+ output.write("RIFF", 4);
40
+ int32_t chunk_size = 36 + static_cast<int32_t>(data.size());
41
+ output.write(reinterpret_cast<char*>(&chunk_size), sizeof(chunk_size));
42
+ output.write("WAVE", 4);
43
+ output.write("fmt ", 4);
44
+ int32_t sub_chunk_size = 16;
45
+ output.write(reinterpret_cast<char*>(&sub_chunk_size), sizeof(sub_chunk_size));
46
+ short audio_format = 1;
47
+ output.write(reinterpret_cast<char*>(&audio_format), sizeof(audio_format));
48
+ short num_channels = 1;
49
+ output.write(reinterpret_cast<char*>(&num_channels), sizeof(num_channels));
50
+ int32_t sample_rate = WHISPER_SAMPLE_RATE;
51
+ output.write(reinterpret_cast<char*>(&sample_rate), sizeof(sample_rate));
52
+ int32_t byte_rate = WHISPER_SAMPLE_RATE * 2;
53
+ output.write(reinterpret_cast<char*>(&byte_rate), sizeof(byte_rate));
54
+ short block_align = 2;
55
+ output.write(reinterpret_cast<char*>(&block_align), sizeof(block_align));
56
+ short bits_per_sample = 16;
57
+ output.write(reinterpret_cast<char*>(&bits_per_sample), sizeof(bits_per_sample));
58
+ output.write("data", 4);
59
+ int32_t sub_chunk2_size = static_cast<int32_t>(data.size());
60
+ output.write(reinterpret_cast<char*>(&sub_chunk2_size), sizeof(sub_chunk2_size));
61
+ output.write(reinterpret_cast<const char*>(data.data()), data.size());
62
+
63
+ output.close();
64
+
65
+ RNWHISPER_LOG_INFO("Saved audio file: %s\n", file.c_str());
66
+ }
67
+
68
+ } // namespace rnaudioutils
@@ -0,0 +1,14 @@
1
+ #include <iostream>
2
+ #include <fstream>
3
+ #include <vector>
4
+ #include <cstdint>
5
+ #include <cstring>
6
+ #include <algorithm>
7
+ #include "whisper.h"
8
+
9
+ namespace rnaudioutils {
10
+
11
+ std::vector<uint8_t> concat_short_buffers(const std::vector<short*>& buffers, const std::vector<int>& slice_n_samples);
12
+ void save_wav_file(const std::vector<uint8_t>& raw, const std::string& file);
13
+
14
+ } // namespace rnaudioutils
@@ -0,0 +1,11 @@
1
#ifndef RNWHISPER_LOG_H
#define RNWHISPER_LOG_H

// printf-style logging macros.
// On Android with RNWHISPER_ANDROID_ENABLE_LOGGING defined they go to logcat
// under RNWHISPER_ANDROID_TAG; everywhere else they print to stderr.
#if defined(__ANDROID__) && defined(RNWHISPER_ANDROID_ENABLE_LOGGING)
#include <android/log.h>
#define RNWHISPER_ANDROID_TAG "RNWHISPER_LOG_ANDROID"
#define RNWHISPER_LOG_INFO(...)  __android_log_print(ANDROID_LOG_INFO , RNWHISPER_ANDROID_TAG, __VA_ARGS__)
#define RNWHISPER_LOG_WARN(...)  __android_log_print(ANDROID_LOG_WARN , RNWHISPER_ANDROID_TAG, __VA_ARGS__)
#define RNWHISPER_LOG_ERROR(...) __android_log_print(ANDROID_LOG_ERROR, RNWHISPER_ANDROID_TAG, __VA_ARGS__)
#else
// The fallback expands to fprintf/stderr, so this header must provide them
// itself instead of relying on whatever the including file pulled in.
#include <cstdio>
#define RNWHISPER_LOG_INFO(...)  fprintf(stderr, __VA_ARGS__)
#define RNWHISPER_LOG_WARN(...)  fprintf(stderr, __VA_ARGS__)
#define RNWHISPER_LOG_ERROR(...) fprintf(stderr, __VA_ARGS__)
#endif // __ANDROID__ && RNWHISPER_ANDROID_ENABLE_LOGGING

#endif // RNWHISPER_LOG_H
@@ -2,41 +2,11 @@
2
2
  #include <string>
3
3
  #include <vector>
4
4
  #include <unordered_map>
5
- #include "whisper.h"
5
+ #include "rn-whisper.h"
6
6
 
7
- extern "C" {
7
// Fallback recording length in seconds, used when the caller passes a
// non-positive value. No trailing ';' here: the macro must stay usable inside
// expressions, not only as the last token of a statement.
#define DEFAULT_MAX_AUDIO_SEC 30
8
8
 
9
- std::unordered_map<int, bool> abort_map;
10
-
11
- bool* rn_whisper_assign_abort_map(int job_id) {
12
- abort_map[job_id] = false;
13
- return &abort_map[job_id];
14
- }
15
-
16
- void rn_whisper_remove_abort_map(int job_id) {
17
- if (abort_map.find(job_id) != abort_map.end()) {
18
- abort_map.erase(job_id);
19
- }
20
- }
21
-
22
- void rn_whisper_abort_transcribe(int job_id) {
23
- if (abort_map.find(job_id) != abort_map.end()) {
24
- abort_map[job_id] = true;
25
- }
26
- }
27
-
28
- bool rn_whisper_transcribe_is_aborted(int job_id) {
29
- if (abort_map.find(job_id) != abort_map.end()) {
30
- return abort_map[job_id];
31
- }
32
- return false;
33
- }
34
-
35
- void rn_whisper_abort_all_transcribe() {
36
- for (auto it = abort_map.begin(); it != abort_map.end(); ++it) {
37
- it->second = true;
38
- }
39
- }
9
+ namespace rnwhisper {
40
10
 
41
11
  void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
42
12
  const float rc = 1.0f / (2.0f * M_PI * cutoff);
@@ -51,42 +21,154 @@ void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate
51
21
  }
52
22
  }
53
23
 
54
- bool rn_whisper_vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
55
- const int n_samples = pcmf32.size();
56
- const int n_samples_last = (sample_rate * last_ms) / 1000;
24
+ bool vad_simple_impl(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
25
+ const int n_samples = pcmf32.size();
26
+ const int n_samples_last = (sample_rate * last_ms) / 1000;
57
27
 
58
- if (n_samples_last >= n_samples) {
59
- // not enough samples - assume no speech
60
- return false;
61
- }
28
+ if (n_samples_last >= n_samples) {
29
+ // not enough samples - assume no speech
30
+ return false;
31
+ }
62
32
 
63
- if (freq_thold > 0.0f) {
64
- high_pass_filter(pcmf32, freq_thold, sample_rate);
65
- }
33
+ if (freq_thold > 0.0f) {
34
+ high_pass_filter(pcmf32, freq_thold, sample_rate);
35
+ }
36
+
37
+ float energy_all = 0.0f;
38
+ float energy_last = 0.0f;
39
+
40
+ for (int i = 0; i < n_samples; i++) {
41
+ energy_all += fabsf(pcmf32[i]);
66
42
 
67
- float energy_all = 0.0f;
68
- float energy_last = 0.0f;
43
+ if (i >= n_samples - n_samples_last) {
44
+ energy_last += fabsf(pcmf32[i]);
45
+ }
46
+ }
47
+
48
+ energy_all /= n_samples;
49
+ energy_last /= n_samples_last;
69
50
 
70
- for (int i = 0; i < n_samples; i++) {
71
- energy_all += fabsf(pcmf32[i]);
51
+ if (verbose) {
52
+ RNWHISPER_LOG_INFO("%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
53
+ }
72
54
 
73
- if (i >= n_samples - n_samples_last) {
74
- energy_last += fabsf(pcmf32[i]);
55
+ if (energy_last > vad_thold*energy_all) {
56
+ return false;
75
57
  }
76
- }
77
58
 
78
- energy_all /= n_samples;
79
- energy_last /= n_samples_last;
59
+ return true;
60
+ }
80
61
 
81
- if (verbose) {
82
- fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
83
- }
62
+ void job::set_realtime_params(
63
+ vad_params params,
64
+ int sec,
65
+ int slice_sec,
66
+ const char* output_path
67
+ ) {
68
+ vad = params;
69
+ if (vad.vad_ms < 2000) vad.vad_ms = 2000;
70
+ audio_sec = sec > 0 ? sec : DEFAULT_MAX_AUDIO_SEC;
71
+ audio_slice_sec = slice_sec > 0 && slice_sec < audio_sec ? slice_sec : audio_sec;
72
+ audio_output_path = output_path;
73
+ }
84
74
 
85
- if (energy_last > vad_thold*energy_all) {
75
+ bool job::vad_simple(int slice_index, int n_samples, int n) {
76
+ if (!vad.use_vad) return true;
77
+
78
+ short* pcm = pcm_slices[slice_index];
79
+ int sample_size = (int) (WHISPER_SAMPLE_RATE * vad.vad_ms / 1000);
80
+ if (n_samples + n > sample_size) {
81
+ int start = n_samples + n - sample_size;
82
+ std::vector<float> pcmf32(sample_size);
83
+ for (int i = 0; i < sample_size; i++) {
84
+ pcmf32[i] = (float)pcm[i + start] / 32768.0f;
85
+ }
86
+ return vad_simple_impl(pcmf32, WHISPER_SAMPLE_RATE, vad.last_ms, vad.vad_thold, vad.freq_thold, vad.verbose);
87
+ }
86
88
  return false;
87
- }
89
+ }
90
+
91
+ void job::put_pcm_data(short* data, int slice_index, int n_samples, int n) {
92
+ if (pcm_slices.size() == slice_index) {
93
+ int n_slices = (int) (WHISPER_SAMPLE_RATE * audio_slice_sec);
94
+ pcm_slices.push_back(new short[n_slices]);
95
+ }
96
+ short* pcm = pcm_slices[slice_index];
97
+ for (int i = 0; i < n; i++) {
98
+ pcm[i + n_samples] = data[i];
99
+ }
100
+ }
101
+
102
+ float* job::pcm_slice_to_f32(int slice_index, int size) {
103
+ if (pcm_slices.size() > slice_index) {
104
+ float* pcmf32 = new float[size];
105
+ for (int i = 0; i < size; i++) {
106
+ pcmf32[i] = (float)pcm_slices[slice_index][i] / 32768.0f;
107
+ }
108
+ return pcmf32;
109
+ }
110
+ return nullptr;
111
+ }
112
+
113
+ bool job::is_aborted() {
114
+ return aborted;
115
+ }
116
+
117
+ void job::abort() {
118
+ aborted = true;
119
+ }
120
+
121
+ job::~job() {
122
+ RNWHISPER_LOG_INFO("rnwhisper::job::%s: job_id: %d\n", __func__, job_id);
123
+
124
+ for (size_t i = 0; i < pcm_slices.size(); i++) {
125
+ delete[] pcm_slices[i];
126
+ }
127
+ pcm_slices.clear();
128
+ }
129
+
130
+ std::unordered_map<int, job*> job_map;
131
+
132
+ void job_abort_all() {
133
+ for (auto it = job_map.begin(); it != job_map.end(); ++it) {
134
+ it->second->abort();
135
+ }
136
+ }
137
+
138
+ job* job_new(int job_id, struct whisper_full_params params) {
139
+ job* ctx = new job();
140
+ ctx->job_id = job_id;
141
+ ctx->params = params;
142
+
143
+ job_map[job_id] = ctx;
144
+
145
+ // Abort handler
146
+ params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
147
+ job *j = (job*)user_data;
148
+ return !j->is_aborted();
149
+ };
150
+ params.encoder_begin_callback_user_data = job_map[job_id];
151
+ params.abort_callback = [](void * user_data) {
152
+ job *j = (job*)user_data;
153
+ return j->is_aborted();
154
+ };
155
+ params.abort_callback_user_data = job_map[job_id];
156
+
157
+ return job_map[job_id];
158
+ }
159
+
160
+ job* job_get(int job_id) {
161
+ if (job_map.find(job_id) != job_map.end()) {
162
+ return job_map[job_id];
163
+ }
164
+ return nullptr;
165
+ }
88
166
 
89
- return true;
167
+ void job_remove(int job_id) {
168
+ if (job_map.find(job_id) != job_map.end()) {
169
+ delete job_map[job_id];
170
+ }
171
+ job_map.erase(job_id);
90
172
  }
91
173
 
92
- }
174
+ }
package/cpp/rn-whisper.h CHANGED
@@ -1,17 +1,49 @@
1
+ #ifndef RNWHISPER_H
2
+ #define RNWHISPER_H
1
3
 
2
- #ifdef __cplusplus
3
4
  #include <string>
4
- #include <whisper.h>
5
- extern "C" {
6
- #endif
7
-
8
- bool* rn_whisper_assign_abort_map(int job_id);
9
- void rn_whisper_remove_abort_map(int job_id);
10
- void rn_whisper_abort_transcribe(int job_id);
11
- bool rn_whisper_transcribe_is_aborted(int job_id);
12
- void rn_whisper_abort_all_transcribe();
13
- bool rn_whisper_vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose);
14
-
15
- #ifdef __cplusplus
16
- }
17
- #endif
5
+ #include <vector>
6
+ #include "whisper.h"
7
+ #include "rn-whisper-log.h"
8
+ #include "rn-audioutils.h"
9
+
10
+ namespace rnwhisper {
11
+
12
/** Voice-activity-detection settings used by job::vad_simple. */
struct vad_params {
    // When false, job::vad_simple returns true without analysing audio.
    bool use_vad{false};
    // Ratio applied to the whole-window energy in the tail comparison.
    float vad_thold{0.6f};
    // High-pass cutoff passed to the filter; values <= 0 skip it.
    float freq_thold{100.0f};
    // Length of the analysed window in ms (set_realtime_params enforces >= 2000).
    int vad_ms{2000};
    // Length of the tail compared against the whole window, in ms.
    int last_ms{1000};
    // Log the measured energies.
    bool verbose{false};
};
20
+
21
+ struct job {
22
+ int job_id;
23
+ bool aborted = false;
24
+ whisper_full_params params;
25
+
26
+ ~job();
27
+ bool is_aborted();
28
+ void abort();
29
+
30
+ // Realtime transcription only:
31
+ vad_params vad;
32
+ int audio_sec = 0;
33
+ int audio_slice_sec = 0;
34
+ const char* audio_output_path = nullptr;
35
+ std::vector<short *> pcm_slices;
36
+ void set_realtime_params(vad_params vad, int sec, int slice_sec, const char* output_path);
37
+ bool vad_simple(int slice_index, int n_samples, int n);
38
+ void put_pcm_data(short* pcm, int slice_index, int n_samples, int n);
39
+ float* pcm_slice_to_f32(int slice_index, int size);
40
+ };
41
+
42
+ void job_abort_all();
43
+ job* job_new(int job_id, struct whisper_full_params params);
44
+ void job_remove(int job_id);
45
+ job* job_get(int job_id);
46
+
47
+ } // namespace rnwhisper
48
+
49
+ #endif // RNWHISPER_H