whisper.rn 0.5.0-rc.0 → 0.5.0-rc.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. package/README.md +128 -50
  2. package/android/build.gradle +1 -0
  3. package/android/src/main/CMakeLists.txt +1 -0
  4. package/android/src/main/java/com/rnwhisper/RNWhisper.java +35 -0
  5. package/android/src/main/java/com/rnwhisper/WhisperContext.java +33 -0
  6. package/android/src/main/jni.cpp +81 -0
  7. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +5 -0
  8. package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +5 -0
  9. package/cpp/jsi/RNWhisperJSI.cpp +42 -6
  10. package/ios/RNWhisper.mm +11 -0
  11. package/ios/RNWhisperContext.h +1 -0
  12. package/ios/RNWhisperContext.mm +46 -0
  13. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  14. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  15. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  16. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  17. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  18. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  19. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  20. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  21. package/lib/commonjs/AudioSessionIos.js +2 -1
  22. package/lib/commonjs/AudioSessionIos.js.map +1 -1
  23. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  24. package/lib/commonjs/index.js +50 -10
  25. package/lib/commonjs/index.js.map +1 -1
  26. package/lib/commonjs/jest-mock.js +126 -0
  27. package/lib/commonjs/jest-mock.js.map +1 -0
  28. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +857 -0
  29. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -0
  30. package/lib/commonjs/realtime-transcription/SliceManager.js +233 -0
  31. package/lib/commonjs/realtime-transcription/SliceManager.js.map +1 -0
  32. package/lib/commonjs/realtime-transcription/adapters/AudioPcmStreamAdapter.js +133 -0
  33. package/lib/commonjs/realtime-transcription/adapters/AudioPcmStreamAdapter.js.map +1 -0
  34. package/lib/commonjs/realtime-transcription/adapters/JestAudioStreamAdapter.js +201 -0
  35. package/lib/commonjs/realtime-transcription/adapters/JestAudioStreamAdapter.js.map +1 -0
  36. package/lib/commonjs/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js +309 -0
  37. package/lib/commonjs/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js.map +1 -0
  38. package/lib/commonjs/realtime-transcription/index.js +27 -0
  39. package/lib/commonjs/realtime-transcription/index.js.map +1 -0
  40. package/lib/commonjs/realtime-transcription/types.js +114 -0
  41. package/lib/commonjs/realtime-transcription/types.js.map +1 -0
  42. package/lib/commonjs/utils/WavFileReader.js +158 -0
  43. package/lib/commonjs/utils/WavFileReader.js.map +1 -0
  44. package/lib/commonjs/utils/WavFileWriter.js +181 -0
  45. package/lib/commonjs/utils/WavFileWriter.js.map +1 -0
  46. package/lib/commonjs/utils/common.js +25 -0
  47. package/lib/commonjs/utils/common.js.map +1 -0
  48. package/lib/module/AudioSessionIos.js +2 -1
  49. package/lib/module/AudioSessionIos.js.map +1 -1
  50. package/lib/module/NativeRNWhisper.js.map +1 -1
  51. package/lib/module/index.js +48 -10
  52. package/lib/module/index.js.map +1 -1
  53. package/lib/module/jest-mock.js +124 -0
  54. package/lib/module/jest-mock.js.map +1 -0
  55. package/lib/module/realtime-transcription/RealtimeTranscriber.js +851 -0
  56. package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -0
  57. package/lib/module/realtime-transcription/SliceManager.js +226 -0
  58. package/lib/module/realtime-transcription/SliceManager.js.map +1 -0
  59. package/lib/module/realtime-transcription/adapters/AudioPcmStreamAdapter.js +124 -0
  60. package/lib/module/realtime-transcription/adapters/AudioPcmStreamAdapter.js.map +1 -0
  61. package/lib/module/realtime-transcription/adapters/JestAudioStreamAdapter.js +194 -0
  62. package/lib/module/realtime-transcription/adapters/JestAudioStreamAdapter.js.map +1 -0
  63. package/lib/module/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js +302 -0
  64. package/lib/module/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js.map +1 -0
  65. package/lib/module/realtime-transcription/index.js +8 -0
  66. package/lib/module/realtime-transcription/index.js.map +1 -0
  67. package/lib/module/realtime-transcription/types.js +107 -0
  68. package/lib/module/realtime-transcription/types.js.map +1 -0
  69. package/lib/module/utils/WavFileReader.js +151 -0
  70. package/lib/module/utils/WavFileReader.js.map +1 -0
  71. package/lib/module/utils/WavFileWriter.js +174 -0
  72. package/lib/module/utils/WavFileWriter.js.map +1 -0
  73. package/lib/module/utils/common.js +18 -0
  74. package/lib/module/utils/common.js.map +1 -0
  75. package/lib/typescript/AudioSessionIos.d.ts +1 -1
  76. package/lib/typescript/AudioSessionIos.d.ts.map +1 -1
  77. package/lib/typescript/NativeRNWhisper.d.ts +1 -0
  78. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  79. package/lib/typescript/index.d.ts +8 -4
  80. package/lib/typescript/index.d.ts.map +1 -1
  81. package/lib/typescript/jest-mock.d.ts +2 -0
  82. package/lib/typescript/jest-mock.d.ts.map +1 -0
  83. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts +166 -0
  84. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -0
  85. package/lib/typescript/realtime-transcription/SliceManager.d.ts +72 -0
  86. package/lib/typescript/realtime-transcription/SliceManager.d.ts.map +1 -0
  87. package/lib/typescript/realtime-transcription/adapters/AudioPcmStreamAdapter.d.ts +22 -0
  88. package/lib/typescript/realtime-transcription/adapters/AudioPcmStreamAdapter.d.ts.map +1 -0
  89. package/lib/typescript/realtime-transcription/adapters/JestAudioStreamAdapter.d.ts +44 -0
  90. package/lib/typescript/realtime-transcription/adapters/JestAudioStreamAdapter.d.ts.map +1 -0
  91. package/lib/typescript/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.d.ts +75 -0
  92. package/lib/typescript/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.d.ts.map +1 -0
  93. package/lib/typescript/realtime-transcription/index.d.ts +6 -0
  94. package/lib/typescript/realtime-transcription/index.d.ts.map +1 -0
  95. package/lib/typescript/realtime-transcription/types.d.ts +222 -0
  96. package/lib/typescript/realtime-transcription/types.d.ts.map +1 -0
  97. package/lib/typescript/utils/WavFileReader.d.ts +61 -0
  98. package/lib/typescript/utils/WavFileReader.d.ts.map +1 -0
  99. package/lib/typescript/utils/WavFileWriter.d.ts +57 -0
  100. package/lib/typescript/utils/WavFileWriter.d.ts.map +1 -0
  101. package/lib/typescript/utils/common.d.ts +9 -0
  102. package/lib/typescript/utils/common.d.ts.map +1 -0
  103. package/package.json +23 -11
  104. package/src/AudioSessionIos.ts +3 -2
  105. package/src/NativeRNWhisper.ts +2 -0
  106. package/src/index.ts +74 -22
  107. package/{jest/mock.js → src/jest-mock.ts} +2 -2
  108. package/src/realtime-transcription/RealtimeTranscriber.ts +1015 -0
  109. package/src/realtime-transcription/SliceManager.ts +252 -0
  110. package/src/realtime-transcription/adapters/AudioPcmStreamAdapter.ts +143 -0
  111. package/src/realtime-transcription/adapters/JestAudioStreamAdapter.ts +251 -0
  112. package/src/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.ts +378 -0
  113. package/src/realtime-transcription/index.ts +34 -0
  114. package/src/realtime-transcription/types.ts +283 -0
  115. package/src/utils/WavFileReader.ts +202 -0
  116. package/src/utils/WavFileWriter.ts +206 -0
  117. package/src/utils/common.ts +17 -0
package/README.md CHANGED
@@ -11,9 +11,9 @@ React Native binding of [whisper.cpp](https://github.com/ggerganov/whisper.cpp).
 ## Screenshots
 
 | <img src="https://github.com/mybigday/whisper.rn/assets/3001525/2fea7b2d-c911-44fb-9afc-8efc7b594446" width="300" /> | <img src="https://github.com/mybigday/whisper.rn/assets/3001525/a5005a6c-44f7-4db9-95e8-0fd951a2e147" width="300" /> |
-| :------------------------------------------: | :------------------------------------------: |
-| iOS: Tested on iPhone 13 Pro Max | Android: Tested on Pixel 6 |
-| (tiny.en, Core ML enabled, release mode + archive) | (tiny.en, armv8.2-a+fp16, release mode) |
+| :------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------: |
+|                                           iOS: Tested on iPhone 13 Pro Max                                            |                                              Android: Tested on Pixel 6                                               |
+|                                   (tiny.en, Core ML enabled, release mode + archive)                                  |                                        (tiny.en, armv8.2-a+fp16, release mode)                                        |
 
 ## Installation
 
@@ -49,7 +49,9 @@ You will need to prebuild the project before using it. See [Expo guide](https://
 If you want to use realtime transcribe, you need to add the microphone permission to your app.
 
 ### iOS
-Add these lines to ```ios/[YOU_APP_NAME]/info.plist```
+
+Add these lines to `ios/[YOU_APP_NAME]/info.plist`
+
 ```xml
 <key>NSMicrophoneUsageDescription</key>
 <string>This app requires microphone access in order to transcribe speech</string>
@@ -58,10 +60,13 @@ Add these lines to ```ios/[YOU_APP_NAME]/info.plist```
 For tvOS, please note that the microphone is not supported.
 
 ### Android
-Add the following line to ```android/app/src/main/AndroidManifest.xml```
+
+Add the following line to `android/app/src/main/AndroidManifest.xml`
+
 ```xml
 <uses-permission android:name="android.permission.RECORD_AUDIO" />
 ```
+
 ## Tips & Tricks
 
 The [Tips & Tricks](docs/TIPS.md) document is a collection of tips and tricks for using `whisper.rn`.
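
Note that declaring `RECORD_AUDIO` in the manifest is not sufficient on Android 6.0+: the permission must also be granted at runtime. A minimal sketch using React Native's `PermissionsAndroid` API (not part of this diff; the rationale strings are illustrative):

```ts
import { PermissionsAndroid } from 'react-native'

// Request the microphone permission at runtime (Android 6.0+).
// RECORD_AUDIO must also be declared in AndroidManifest.xml as shown above.
export async function requestMicPermission(): Promise<boolean> {
  const status = await PermissionsAndroid.request(
    PermissionsAndroid.PERMISSIONS.RECORD_AUDIO,
    {
      title: 'Microphone Permission',
      message: 'This app needs microphone access to transcribe speech.',
      buttonPositive: 'OK',
    },
  )
  return status === PermissionsAndroid.RESULTS.GRANTED
}
```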
@@ -83,24 +88,6 @@ const { result } = await promise
 // result: (The inference text result from audio file)
 ```
 
-Use realtime transcribe:
-
-```js
-const { stop, subscribe } = await whisperContext.transcribeRealtime(options)
-
-subscribe(evt => {
-  const { isCapturing, data, processTime, recordingTime } = evt
-  console.log(
-    `Realtime transcribing: ${isCapturing ? 'ON' : 'OFF'}\n` +
-      // The inference text result from audio record:
-      `Result: ${data.result}\n\n` +
-      `Process time: ${processTime}ms\n` +
-      `Recording time: ${recordingTime}ms`,
-  )
-  if (!isCapturing) console.log('Finished realtime transcribing')
-})
-```
-
 ## Voice Activity Detection (VAD)
 
 Voice Activity Detection allows you to detect speech segments in audio data using the Silero VAD model.
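
For context on the VAD flow this section describes, a minimal end-to-end sketch follows. Only `detectSpeechData`, `release`, and `releaseAllWhisperVad` appear in this diff; the import path, `initWhisperVad` option names, and values below are assumptions:

```ts
import { initWhisperVad, releaseAllWhisperVad } from 'whisper.rn'

// Minimal VAD lifecycle: load a Silero VAD model, detect speech segments in
// base64-encoded PCM audio, then release the context.
async function detectSegments(base64AudioData: string) {
  const vadContext = await initWhisperVad({
    filePath: 'path/to/ggml-silero-vad.bin', // hypothetical model location
  })
  const segments = await vadContext.detectSpeechData(base64AudioData, {
    threshold: 0.5, // speech probability threshold (assumed)
  })
  console.log(`Detected ${segments.length} speech segment(s)`)
  await vadContext.release()
  await releaseAllWhisperVad()
}
```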
@@ -157,7 +144,11 @@ const segments = await vadContext.detectSpeechData(base64AudioData, {
 
 ```typescript
 segments.forEach((segment, index) => {
-  console.log(`Segment ${index + 1}: ${segment.t0.toFixed(2)}s - ${segment.t1.toFixed(2)}s`)
+  console.log(
+    `Segment ${index + 1}: ${segment.t0.toFixed(2)}s - ${segment.t1.toFixed(
+      2,
+    )}s`,
+  )
   console.log(`Duration: ${(segment.t1 - segment.t0).toFixed(2)}s`)
 })
 ```
@@ -170,35 +161,57 @@ await vadContext.release()
 await releaseAllWhisperVad()
 ```
 
-In iOS, You may need to change the Audio Session so that it can be used with other audio playback, or to optimize the quality of the recording. So we have provided AudioSession utilities for you:
+## Realtime Transcription
+
+The new `RealtimeTranscriber` provides enhanced realtime transcription with features like Voice Activity Detection (VAD), auto-slicing, and memory management.
 
-Option 1 - Use options in transcribeRealtime:
 ```js
-import { AudioSessionIos } from 'whisper.rn'
+// If your RN packager does not enable package exports support, use whisper.rn/src/realtime-transcription
+import { RealtimeTranscriber } from 'whisper.rn/realtime-transcription'
+import { AudioPcmStreamAdapter } from 'whisper.rn/realtime-transcription/adapters'
+import RNFS from 'react-native-fs' // or any compatible filesystem
 
-const { stop, subscribe } = await whisperContext.transcribeRealtime({
-  audioSessionOnStartIos: {
-    category: AudioSessionIos.Category.PlayAndRecord,
-    options: [AudioSessionIos.CategoryOption.MixWithOthers],
-    mode: AudioSessionIos.Mode.Default,
-  },
-  audioSessionOnStopIos: 'restore', // Or an AudioSessionSettingIos
+// Dependencies
+const whisperContext = await initWhisper({
+  /* ... */
+})
+const vadContext = await initWhisperVad({
+  /* ... */
 })
+const audioStream = new AudioPcmStreamAdapter() // requires @fugood/react-native-audio-pcm-stream
+
+// Create transcriber
+const transcriber = new RealtimeTranscriber(
+  { whisperContext, vadContext, audioStream, fs: RNFS },
+  {
+    audioSliceSec: 30,
+    vadPreset: 'default',
+    autoSliceOnSpeechEnd: true,
+    transcribeOptions: { language: 'en' },
+  },
+  {
+    onTranscribe: (event) => console.log('Transcription:', event.data?.result),
+    onVad: (event) => console.log('VAD:', event.type, event.confidence),
+    onStatusChange: (isActive) =>
+      console.log('Status:', isActive ? 'ACTIVE' : 'INACTIVE'),
+    onError: (error) => console.error('Error:', error),
+  },
+)
+
+// Start/stop transcription
+await transcriber.start()
+await transcriber.stop()
 ```
 
-Option 2 - Manage the Audio Session in anywhere:
-```js
-import { AudioSessionIos } from 'whisper.rn'
+**Dependencies:**
 
-await AudioSessionIos.setCategory(
-  AudioSessionIos.Category.PlayAndRecord, [AudioSessionIos.CategoryOption.MixWithOthers],
-)
-await AudioSessionIos.setMode(AudioSessionIos.Mode.Default)
-await AudioSessionIos.setActive(true)
-// Then you can start do recording
-```
+- `@fugood/react-native-audio-pcm-stream` for `AudioPcmStreamAdapter`
+- A compatible filesystem module (e.g., `react-native-fs`). See the [filesystem interface](src/utils/WavFileWriter.ts#L9-L16) for the TypeScript definition
 
-In Android, you may need to request the microphone permission by [`PermissionAndroid`](https://reactnative.dev/docs/permissionsandroid).
+**Custom Audio Adapters:**
+You can create custom audio stream adapters by implementing the [AudioStreamInterface](src/realtime-transcription/types.ts#L21-L30). This allows integration with different audio sources or custom audio processing pipelines (see the sketch following this diff).
+
+**Example:** See the [complete example](example/src/RealtimeTranscriber.tsx) for a full implementation including file simulation and UI.
 
 Please visit the [Documentation](docs/) for more details.
 
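
As referenced in the section above, a custom audio source can implement the package's `AudioStreamInterface` (defined in src/realtime-transcription/types.ts). A hedged sketch; the method names and payload shape below are illustrative assumptions, not the interface's confirmed definition:

```ts
// Hypothetical custom audio stream adapter for RealtimeTranscriber.
// The real AudioStreamInterface lives in src/realtime-transcription/types.ts;
// the members used here (initialize/start/stop/onData) are assumptions.
type AudioDataCallback = (chunk: { data: Uint8Array; sampleRate: number }) => void

class WebSocketAudioAdapter {
  private callbacks: AudioDataCallback[] = []
  private ws?: WebSocket

  async initialize(): Promise<void> {
    this.ws = new WebSocket('wss://example.com/pcm-feed') // hypothetical source
    this.ws.binaryType = 'arraybuffer'
  }

  async start(): Promise<void> {
    if (!this.ws) throw new Error('initialize() must be called first')
    this.ws.onmessage = (event) => {
      // Forward 16 kHz mono 16-bit PCM chunks to every subscriber.
      const chunk = { data: new Uint8Array(event.data as ArrayBuffer), sampleRate: 16000 }
      this.callbacks.forEach((cb) => cb(chunk))
    }
  }

  async stop(): Promise<void> {
    this.ws?.close()
    this.ws = undefined
  }

  onData(callback: AudioDataCallback): void {
    this.callbacks.push(callback)
  }
}
```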
@@ -213,8 +226,10 @@ const whisperContext = await initWhisper({
   filePath: require('../assets/ggml-tiny.en.bin'),
 })
 
-const { stop, promise } =
-  whisperContext.transcribe(require('../assets/sample.wav'), options)
+const { stop, promise } = whisperContext.transcribe(
+  require('../assets/sample.wav'),
+  options,
+)
 
 // ...
 ```
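
The `{ stop, promise }` pair shown above supports cancellation. A minimal usage sketch (the timeout and option values are illustrative):

```ts
// Start a transcription job and cancel it if it runs too long.
async function transcribeWithTimeout(whisperContext: any) {
  const { stop, promise } = whisperContext.transcribe(
    require('../assets/sample.wav'),
    { language: 'en' },
  )
  const timer = setTimeout(() => stop(), 30_000) // illustrative 30s cap
  try {
    const { result } = await promise
    console.log('Transcript:', result)
  } finally {
    clearTimeout(timer)
  }
}
```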
@@ -233,18 +248,19 @@ module.exports = {
       ...defaultAssetExts,
       'bin', // whisper.rn: ggml model binary
       'mil', // whisper.rn: CoreML model asset
-    ]
+    ],
   },
 }
 ```
 
 Please note that:
+
 - It will significantly increase the size of the app in release mode.
 - The RN packager does not allow files larger than 2GB, so the original f16 `large` model (2.9GB) cannot be used; use quantized models instead.
 
 ## Core ML support
 
-__*Platform: iOS 15.0+, tvOS 15.0+*__
+**_Platform: iOS 15.0+, tvOS 15.0+_**
 
 To use Core ML on iOS, you will need to have the Core ML model files.
 
@@ -301,9 +317,71 @@ Please follow the [Development Workflow section of contributing guide](./CONTRIB
 We have provided a mock version of `whisper.rn` for testing purposes that you can use with Jest:
 
 ```js
-jest.mock('whisper.rn', () => require('whisper.rn/jest/mock'))
+jest.mock('whisper.rn', () => require('whisper.rn/jest-mock'))
+```
+
+## Deprecated APIs
+
+### `transcribeRealtime` (Deprecated)
+
+> ⚠️ **Deprecated**: Use `RealtimeTranscriber` instead for enhanced features and better performance.
+
+```js
+const { stop, subscribe } = await whisperContext.transcribeRealtime(options)
+
+subscribe((evt) => {
+  const { isCapturing, data, processTime, recordingTime } = evt
+  console.log(
+    `Realtime transcribing: ${isCapturing ? 'ON' : 'OFF'}\n` +
+      `Result: ${data.result}\n\n` +
+      `Process time: ${processTime}ms\n` +
+      `Recording time: ${recordingTime}ms`,
+  )
+  if (!isCapturing) console.log('Finished realtime transcribing')
+})
 ```
 
+On iOS, you may need to change the Audio Session so that it can be used alongside other audio playback, or to optimize recording quality, so we have provided AudioSession utilities for you:
+
+Option 1 - Use options in transcribeRealtime:
+
+```js
+import { AudioSessionIos } from 'whisper.rn'
+
+const { stop, subscribe } = await whisperContext.transcribeRealtime({
+  audioSessionOnStartIos: {
+    category: AudioSessionIos.Category.PlayAndRecord,
+    options: [AudioSessionIos.CategoryOption.MixWithOthers],
+    mode: AudioSessionIos.Mode.Default,
+  },
+  audioSessionOnStopIos: 'restore', // Or an AudioSessionSettingIos
+})
+```
+
+Option 2 - Manage the Audio Session anywhere:
+
+```js
+import { AudioSessionIos } from 'whisper.rn'
+
+await AudioSessionIos.setCategory(AudioSessionIos.Category.PlayAndRecord, [
+  AudioSessionIos.CategoryOption.MixWithOthers,
+])
+await AudioSessionIos.setMode(AudioSessionIos.Mode.Default)
+await AudioSessionIos.setActive(true)
+// Then you can start recording
+```
+
+On Android, you may need to request the microphone permission via [`PermissionsAndroid`](https://reactnative.dev/docs/permissionsandroid).
+
+## Apps using `whisper.rn`
+
+- [BRICKS](https://bricks.tools): Our product for building interactive signage in a simple way. We provide LLM functions as Generator LLM/Assistant.
+- ... (Any contribution is welcome)
+
+## Node.js binding
+
+- [whisper.node](https://github.com/mybigday/whisper.node): Another Node.js binding of `whisper.cpp` with the same API as `whisper.rn`.
+
 
 ## Contributing
 
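
With the mock path moving from `whisper.rn/jest/mock` to `whisper.rn/jest-mock` in this release, a minimal Jest test might look like the following sketch (the mock's resolved payload shape is an assumption):

```ts
// __tests__/transcribe.test.ts
jest.mock('whisper.rn', () => require('whisper.rn/jest-mock'))

import { initWhisper } from 'whisper.rn'

test('transcribe resolves via the mock', async () => {
  const ctx = await initWhisper({ filePath: 'ggml-tiny.en.bin' })
  const { promise } = ctx.transcribe('sample.wav', {})
  // The exact mock payload is an assumption; adjust to the real mock's shape.
  await expect(promise).resolves.toBeDefined()
})
```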
package/android/build.gradle CHANGED
@@ -102,6 +102,7 @@ android {
         "**/libreactnative.so",
         "**/libreactnativejni.so",
         "**/libturbomodulejsijni.so",
+        "**/libreact_nativemodule_core.so",
       ]
     }
 
package/android/src/main/CMakeLists.txt CHANGED
@@ -110,6 +110,7 @@ build_library("rnwhisper" "generic" "")
 
 if (${ANDROID_ABI} STREQUAL "arm64-v8a")
     build_library("rnwhisper_v8fp16_va_2" "arm" "-march=armv8.2-a+fp16")
+    build_library("rnwhisper_v8" "arm" "-march=armv8-a")
 elseif (${ANDROID_ABI} STREQUAL "armeabi-v7a")
     build_library("rnwhisper_vfpv4" "arm" "-mfpu=neon-vfpv4")
 elseif (${ANDROID_ABI} STREQUAL "x86_64")
package/android/src/main/java/com/rnwhisper/RNWhisper.java CHANGED
@@ -64,6 +64,10 @@ public class RNWhisper implements LifecycleEventListener {
   }
 
   public void installJSIBindings(Promise promise) {
+    if (!WhisperContext.isNativeLibraryLoaded()) {
+      promise.reject("Native library not loaded");
+      return;
+    }
 
     AsyncTask task = new AsyncTask<Void, Void, Void>() {
       private Exception exception;
@@ -95,6 +99,37 @@ public class RNWhisper implements LifecycleEventListener {
     tasks.put(task, "installJSIBindings");
   }
 
+  public void toggleNativeLog(boolean enabled, Promise promise) {
+    if (!WhisperContext.isNativeLibraryLoaded()) {
+      promise.reject("Native library not loaded");
+      return;
+    }
+
+    new AsyncTask<Void, Void, Boolean>() {
+      private Exception exception;
+
+      @Override
+      protected Boolean doInBackground(Void... voids) {
+        try {
+          WhisperContext.toggleNativeLog(reactContext, enabled);
+          return true;
+        } catch (Exception e) {
+          exception = e;
+        }
+        return null;
+      }
+
+      @Override
+      protected void onPostExecute(Boolean result) {
+        if (exception != null) {
+          promise.reject(exception);
+          return;
+        }
+        promise.resolve(result);
+      }
+    }.executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR);
+  }
+
   private int getResourceIdentifier(String filePath) {
     int identifier = reactContext.getResources().getIdentifier(
       filePath,
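
The `toggleNativeLog` method added above (and mirrored on iOS later in this diff) emits `@RNWhisper_onNativeLog` events with a `{ level, text }` payload. A hedged JS-side sketch; calling the native module directly like this is an assumption about the JS API, and whisper.rn may also expose a wrapper that is not shown in this diff:

```ts
import { NativeEventEmitter, NativeModules } from 'react-native'

// Forward whisper.cpp native logs to the JS console. The payload shape
// ({ level, text }) matches the Java and iOS emitters in this diff; direct
// NativeModules access here is an assumption about the exposed JS API.
const { RNWhisper } = NativeModules
const emitter = new NativeEventEmitter(RNWhisper)

const subscription = emitter.addListener(
  '@RNWhisper_onNativeLog',
  ({ level, text }: { level: string; text: string }) => {
    console.log(`[whisper.cpp][${level}] ${text}`)
  },
)

RNWhisper.toggleNativeLog(true) // start forwarding native logs

// Later, when logs are no longer needed:
// RNWhisper.toggleNativeLog(false)
// subscription.remove()
```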
package/android/src/main/java/com/rnwhisper/WhisperContext.java CHANGED
@@ -29,6 +29,29 @@ public class WhisperContext {
 
   private static String loadedLibrary = "";
 
+  private static class NativeLogCallback {
+    DeviceEventManagerModule.RCTDeviceEventEmitter eventEmitter;
+
+    public NativeLogCallback(ReactApplicationContext reactContext) {
+      this.eventEmitter = reactContext.getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter.class);
+    }
+
+    void emitNativeLog(String level, String text) {
+      WritableMap event = Arguments.createMap();
+      event.putString("level", level);
+      event.putString("text", text);
+      eventEmitter.emit("@RNWhisper_onNativeLog", event);
+    }
+  }
+
+  static void toggleNativeLog(ReactApplicationContext reactContext, boolean enabled) {
+    if (enabled) {
+      setupLog(new NativeLogCallback(reactContext));
+    } else {
+      unsetLog();
+    }
+  }
+
   private static final int SAMPLE_RATE = 16000;
   private static final int CHANNEL_CONFIG = AudioFormat.CHANNEL_IN_MONO;
   private static final int AUDIO_FORMAT = AudioFormat.ENCODING_PCM_16BIT;
@@ -454,6 +477,10 @@
         Log.d(NAME, "Loading librnwhisper_v8fp16_va_2.so");
         System.loadLibrary("rnwhisper_v8fp16_va_2");
         loadedLibrary = "rnwhisper_v8fp16_va_2";
+      } else {
+        Log.d(NAME, "Loading librnwhisper_v8.so");
+        System.loadLibrary("rnwhisper_v8");
+        loadedLibrary = "rnwhisper_v8";
       }
     } else if (WhisperContext.isArmeabiV7a()) {
       Log.d(NAME, "Loading librnwhisper_vfpv4.so");
@@ -468,6 +495,10 @@
     }
   }
 
+  public static boolean isNativeLibraryLoaded() {
+    return loadedLibrary != "";
+  }
+
   public static boolean isArm64V8a() {
     return Build.SUPPORTED_ABIS[0].equals("arm64-v8a");
   }
@@ -571,4 +602,6 @@
   // JSI Installation
   protected static native void installJSIBindings(long runtimePtr, Object callInvokerHolder);
   protected static native void cleanupJSIBindings();
+  protected static native void setupLog(NativeLogCallback logCallback);
+  protected static native void unsetLog();
 }
package/android/src/main/jni.cpp CHANGED
@@ -23,6 +23,64 @@
 #define LOGI(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__)
 #define LOGW(...) __android_log_print(ANDROID_LOG_WARN, TAG, __VA_ARGS__)
 
+struct log_callback_context {
+  JavaVM *jvm;
+  jobject callback;
+};
+
+static void rnwhisper_log_callback_default(enum wsp_ggml_log_level level, const char * fmt, void * data) {
+  if (level == WSP_GGML_LOG_LEVEL_ERROR) __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data);
+  else if (level == WSP_GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data);
+  else if (level == WSP_GGML_LOG_LEVEL_WARN) __android_log_print(ANDROID_LOG_WARN, TAG, fmt, data);
+  else __android_log_print(ANDROID_LOG_DEFAULT, TAG, fmt, data);
+}
+
+static void rnwhisper_log_callback_to_j(enum wsp_ggml_log_level level, const char * text, void * data) {
+  const char* level_c = "";
+  if (level == WSP_GGML_LOG_LEVEL_ERROR) {
+    __android_log_print(ANDROID_LOG_ERROR, TAG, text, nullptr);
+    level_c = "error";
+  } else if (level == WSP_GGML_LOG_LEVEL_INFO) {
+    __android_log_print(ANDROID_LOG_INFO, TAG, text, nullptr);
+    level_c = "info";
+  } else if (level == WSP_GGML_LOG_LEVEL_WARN) {
+    __android_log_print(ANDROID_LOG_WARN, TAG, text, nullptr);
+    level_c = "warn";
+  } else {
+    __android_log_print(ANDROID_LOG_DEFAULT, TAG, text, nullptr);
+  }
+
+  log_callback_context *cb_ctx = (log_callback_context *) data;
+
+  JNIEnv *env;
+  bool need_detach = false;
+  int getEnvResult = cb_ctx->jvm->GetEnv((void**)&env, JNI_VERSION_1_6);
+
+  if (getEnvResult == JNI_EDETACHED) {
+    if (cb_ctx->jvm->AttachCurrentThread(&env, nullptr) == JNI_OK) {
+      need_detach = true;
+    } else {
+      return;
+    }
+  } else if (getEnvResult != JNI_OK) {
+    return;
+  }
+
+  jobject callback = cb_ctx->callback;
+  jclass cb_class = env->GetObjectClass(callback);
+  jmethodID emitNativeLog = env->GetMethodID(cb_class, "emitNativeLog", "(Ljava/lang/String;Ljava/lang/String;)V");
+
+  jstring level_str = env->NewStringUTF(level_c);
+  jstring text_str = env->NewStringUTF(text);
+  env->CallVoidMethod(callback, emitNativeLog, level_str, text_str);
+  env->DeleteLocalRef(level_str);
+  env->DeleteLocalRef(text_str);
+
+  if (need_detach) {
+    cb_ctx->jvm->DetachCurrentThread();
+  }
+}
+
 static inline int min(int a, int b) {
   return (a < b) ? a : b;
 }
@@ -800,7 +858,30 @@ Java_com_rnwhisper_WhisperContext_cleanupJSIBindings(
   JNIEnv *env,
   jclass clazz
 ) {
+  UNUSED(env);
+  UNUSED(clazz);
   rnwhisper_jsi::cleanupJSIBindings();
 }
 
+JNIEXPORT void JNICALL
+Java_com_rnwhisper_WhisperContext_setupLog(JNIEnv *env, jobject thiz, jobject logCallback) {
+  UNUSED(thiz);
+
+  log_callback_context *cb_ctx = new log_callback_context;
+
+  JavaVM *jvm;
+  env->GetJavaVM(&jvm);
+  cb_ctx->jvm = jvm;
+  cb_ctx->callback = env->NewGlobalRef(logCallback);
+
+  whisper_log_set(rnwhisper_log_callback_to_j, cb_ctx);
+}
+
+JNIEXPORT void JNICALL
+Java_com_rnwhisper_WhisperContext_unsetLog(JNIEnv *env, jobject thiz) {
+  UNUSED(env);
+  UNUSED(thiz);
+  whisper_log_set(rnwhisper_log_callback_default, NULL);
+}
+
 } // extern "C"
package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java CHANGED
@@ -31,6 +31,11 @@ public class RNWhisperModule extends NativeRNWhisperSpec {
     rnwhisper.installJSIBindings(promise);
   }
 
+  @ReactMethod
+  public void toggleNativeLog(boolean enabled, Promise promise) {
+    rnwhisper.toggleNativeLog(enabled, promise);
+  }
+
   @Override
   @NonNull
   public String getName() {
package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java CHANGED
@@ -31,6 +31,11 @@ public class RNWhisperModule extends ReactContextBaseJavaModule {
     rnwhisper.installJSIBindings(promise);
   }
 
+  @ReactMethod
+  public void toggleNativeLog(boolean enabled, Promise promise) {
+    rnwhisper.toggleNativeLog(enabled, promise);
+  }
+
   @Override
   @NonNull
   public String getName() {
package/cpp/jsi/RNWhisperJSI.cpp CHANGED
@@ -295,6 +295,38 @@ CallbackInfo extractCallbacks(Runtime& runtime, const Object& optionsObj) {
   return info;
 }
 
+// Helper function to extract VAD parameters from options
+whisper_vad_params extractVadParams(Runtime& runtime, const Object& optionsObj) {
+  whisper_vad_params vadParams = whisper_vad_default_params();
+
+  try {
+    auto propNames = optionsObj.getPropertyNames(runtime);
+    for (size_t i = 0; i < propNames.size(runtime); i++) {
+      auto propNameValue = propNames.getValueAtIndex(runtime, i);
+      std::string propName = propNameValue.getString(runtime).utf8(runtime);
+      Value propValue = optionsObj.getProperty(runtime, propNameValue.getString(runtime));
+
+      if (propName == "threshold" && propValue.isNumber()) {
+        vadParams.threshold = (float)propValue.getNumber();
+      } else if (propName == "minSpeechDurationMs" && propValue.isNumber()) {
+        vadParams.min_speech_duration_ms = (int)propValue.getNumber();
+      } else if (propName == "minSilenceDurationMs" && propValue.isNumber()) {
+        vadParams.min_silence_duration_ms = (int)propValue.getNumber();
+      } else if (propName == "maxSpeechDurationS" && propValue.isNumber()) {
+        vadParams.max_speech_duration_s = (float)propValue.getNumber();
+      } else if (propName == "speechPadMs" && propValue.isNumber()) {
+        vadParams.speech_pad_ms = (int)propValue.getNumber();
+      } else if (propName == "samplesOverlap" && propValue.isNumber()) {
+        vadParams.samples_overlap = (float)propValue.getNumber();
+      }
+    }
+  } catch (...) {
+    // Ignore parameter extraction errors
+  }
+
+  return vadParams;
+}
+
 // Helper function to create segments array
 Array createSegmentsArray(Runtime& runtime, struct whisper_context* ctx, int offset) {
   int n_segments = whisper_full_n_segments(ctx);
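
The `extractVadParams` helper above maps camelCase JS options onto `whisper_vad_params` fields. For reference, a sketch of the corresponding JS options object; the key names come from the branches above, the values are illustrative, and `whisper_vad_default_params()` fills in anything omitted:

```ts
// VAD options recognized by the JSI layer; each key maps to a
// whisper_vad_params field as handled in extractVadParams above.
const vadOptions = {
  threshold: 0.5,            // -> threshold (float)
  minSpeechDurationMs: 250,  // -> min_speech_duration_ms (int)
  minSilenceDurationMs: 100, // -> min_silence_duration_ms (int)
  maxSpeechDurationS: 30,    // -> max_speech_duration_s (float)
  speechPadMs: 30,           // -> speech_pad_ms (int)
  samplesOverlap: 0.1,       // -> samples_overlap (float)
}

// Passed through e.g. vadContext.detectSpeechData(base64AudioData, vadOptions)
```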
@@ -355,10 +387,13 @@ Value createPromiseTask(
 
   whisper_full_params params = {};
   CallbackInfo callbackInfo = {};
+  whisper_vad_params vadParams = {};
   if (functionName == "whisperTranscribeData") {
     params = createFullParamsFromJSI(runtime, optionsObj);
     // Extract data from optionsObj before lambda capture
     callbackInfo = extractCallbacks(runtime, optionsObj);
+  } else if (functionName == "whisperVadDetectSpeech") {
+    vadParams = extractVadParams(runtime, optionsObj);
   }
 
   // Create promise
@@ -368,7 +403,7 @@
       runtime,
       PropNameID::forAscii(runtime, ""),
       2, // resolve, reject
-      [contextId, audioResult, params, callbackInfo, task, callInvoker, functionName](Runtime& runtime, const Value& thisValue, const Value* arguments, size_t count) -> Value {
+      [contextId, audioResult, params, callbackInfo, vadParams, task, callInvoker, functionName](Runtime& runtime, const Value& thisValue, const Value* arguments, size_t count) -> Value {
         if (count != 2) {
           throw JSError(runtime, "Promise executor expects 2 arguments (resolve, reject)");
         }
@@ -379,10 +414,10 @@
 
         // Execute task in ThreadPool
         auto future = getWhisperThreadPool().enqueue([
-          contextId, audioResult, params, callbackInfo, task, resolvePtr, rejectPtr, callInvoker, safeRuntime, functionName]() {
+          contextId, audioResult, params, callbackInfo, vadParams, task, resolvePtr, rejectPtr, callInvoker, safeRuntime, functionName]() {
 
           try {
-            task(contextId, audioResult, params, callbackInfo, resolvePtr, rejectPtr, callInvoker, safeRuntime);
+            task(contextId, audioResult, params, callbackInfo, vadParams, resolvePtr, rejectPtr, callInvoker, safeRuntime);
           } catch (...) {
             callInvoker->invokeAsync([rejectPtr, safeRuntime, functionName]() {
               auto& runtime = *safeRuntime;
@@ -413,7 +448,7 @@ void installJSIBindings(
     try {
       return createPromiseTask<whisper_context>(
         runtime, "whisperTranscribeData", callInvoker, arguments, count,
-        [](int contextId, const AudioData& audioResult, const whisper_full_params& params, const CallbackInfo& callbackInfo,
+        [](int contextId, const AudioData& audioResult, const whisper_full_params& params, const CallbackInfo& callbackInfo, const whisper_vad_params& vadParams,
            std::shared_ptr<Function> resolvePtr, std::shared_ptr<Function> rejectPtr,
            std::shared_ptr<facebook::react::CallInvoker> callInvoker,
            std::shared_ptr<Runtime> safeRuntime) {
@@ -566,7 +601,7 @@
     try {
       return createPromiseTask<whisper_vad_context>(
         runtime, "whisperVadDetectSpeech", callInvoker, arguments, count,
-        [](int contextId, const AudioData& audioResult, const whisper_full_params& params, const CallbackInfo& callbackInfo,
+        [](int contextId, const AudioData& audioResult, const whisper_full_params& params, const CallbackInfo& callbackInfo, const whisper_vad_params& vadParams,
           std::shared_ptr<Function> resolvePtr, std::shared_ptr<Function> rejectPtr,
           std::shared_ptr<facebook::react::CallInvoker> callInvoker,
           std::shared_ptr<Runtime> safeRuntime) {
@@ -600,7 +635,8 @@
             bool isSpeech = whisper_vad_detect_speech(vadContext, audioResult.data.data(), audioResult.count);
             logInfo("VAD detection result: %s", isSpeech ? "speech" : "no speech");
 
-            struct whisper_vad_params vad_params = whisper_vad_default_params();
+            struct whisper_vad_params vad_params = vadParams;
+
             struct whisper_vad_segments* segments = nullptr;
             if (isSpeech) {
               segments = whisper_vad_segments_from_probs(vadContext, vad_params);
package/ios/RNWhisper.mm CHANGED
@@ -24,6 +24,16 @@ RCT_EXPORT_MODULE()
   return NO;
 }
 
+RCT_EXPORT_METHOD(toggleNativeLog:(BOOL)enabled) {
+  void (^onEmitLog)(NSString *level, NSString *text) = nil;
+  if (enabled) {
+    onEmitLog = ^(NSString *level, NSString *text) {
+      [self sendEventWithName:@"@RNWhisper_onNativeLog" body:@{ @"level": level, @"text": text }];
+    };
+  }
+  [RNWhisperContext toggleNativeLog:enabled onEmitLog:onEmitLog];
+}
+
 - (NSDictionary *)constantsToExport
 {
   return @{
@@ -107,6 +117,7 @@ RCT_REMAP_METHOD(initContext,
     @"@RNWhisper_onTranscribeNewSegments",
     @"@RNWhisper_onRealtimeTranscribe",
     @"@RNWhisper_onRealtimeTranscribeEnd",
+    @"@RNWhisper_onNativeLog",
   ];
 }
 
package/ios/RNWhisperContext.h CHANGED
@@ -47,6 +47,7 @@ typedef struct {
   bool isMetalEnabled;
 }
 
++ (void)toggleNativeLog:(BOOL)enabled onEmitLog:(void (^)(NSString *level, NSString *text))onEmitLog;
 + (instancetype)initWithModelPath:(NSString *)modelPath contextId:(int)contextId noCoreML:(BOOL)noCoreML noMetal:(BOOL)noMetal useFlashAttn:(BOOL)useFlashAttn;
 - (bool)isMetalEnabled;
 - (NSString *)reasonNoMetal;