whisper.rn 0.5.0-rc.0 → 0.5.0-rc.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +128 -50
- package/android/build.gradle +1 -0
- package/android/src/main/CMakeLists.txt +1 -0
- package/android/src/main/java/com/rnwhisper/RNWhisper.java +35 -0
- package/android/src/main/java/com/rnwhisper/WhisperContext.java +33 -0
- package/android/src/main/jni.cpp +81 -0
- package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +5 -0
- package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +5 -0
- package/cpp/jsi/RNWhisperJSI.cpp +42 -6
- package/ios/RNWhisper.mm +11 -0
- package/ios/RNWhisperContext.h +1 -0
- package/ios/RNWhisperContext.mm +46 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/lib/commonjs/AudioSessionIos.js +2 -1
- package/lib/commonjs/AudioSessionIos.js.map +1 -1
- package/lib/commonjs/NativeRNWhisper.js.map +1 -1
- package/lib/commonjs/index.js +50 -10
- package/lib/commonjs/index.js.map +1 -1
- package/lib/commonjs/jest-mock.js +126 -0
- package/lib/commonjs/jest-mock.js.map +1 -0
- package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +857 -0
- package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -0
- package/lib/commonjs/realtime-transcription/SliceManager.js +233 -0
- package/lib/commonjs/realtime-transcription/SliceManager.js.map +1 -0
- package/lib/commonjs/realtime-transcription/adapters/AudioPcmStreamAdapter.js +133 -0
- package/lib/commonjs/realtime-transcription/adapters/AudioPcmStreamAdapter.js.map +1 -0
- package/lib/commonjs/realtime-transcription/adapters/JestAudioStreamAdapter.js +201 -0
- package/lib/commonjs/realtime-transcription/adapters/JestAudioStreamAdapter.js.map +1 -0
- package/lib/commonjs/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js +309 -0
- package/lib/commonjs/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js.map +1 -0
- package/lib/commonjs/realtime-transcription/index.js +27 -0
- package/lib/commonjs/realtime-transcription/index.js.map +1 -0
- package/lib/commonjs/realtime-transcription/types.js +114 -0
- package/lib/commonjs/realtime-transcription/types.js.map +1 -0
- package/lib/commonjs/utils/WavFileReader.js +158 -0
- package/lib/commonjs/utils/WavFileReader.js.map +1 -0
- package/lib/commonjs/utils/WavFileWriter.js +181 -0
- package/lib/commonjs/utils/WavFileWriter.js.map +1 -0
- package/lib/commonjs/utils/common.js +25 -0
- package/lib/commonjs/utils/common.js.map +1 -0
- package/lib/module/AudioSessionIos.js +2 -1
- package/lib/module/AudioSessionIos.js.map +1 -1
- package/lib/module/NativeRNWhisper.js.map +1 -1
- package/lib/module/index.js +48 -10
- package/lib/module/index.js.map +1 -1
- package/lib/module/jest-mock.js +124 -0
- package/lib/module/jest-mock.js.map +1 -0
- package/lib/module/realtime-transcription/RealtimeTranscriber.js +851 -0
- package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -0
- package/lib/module/realtime-transcription/SliceManager.js +226 -0
- package/lib/module/realtime-transcription/SliceManager.js.map +1 -0
- package/lib/module/realtime-transcription/adapters/AudioPcmStreamAdapter.js +124 -0
- package/lib/module/realtime-transcription/adapters/AudioPcmStreamAdapter.js.map +1 -0
- package/lib/module/realtime-transcription/adapters/JestAudioStreamAdapter.js +194 -0
- package/lib/module/realtime-transcription/adapters/JestAudioStreamAdapter.js.map +1 -0
- package/lib/module/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js +302 -0
- package/lib/module/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js.map +1 -0
- package/lib/module/realtime-transcription/index.js +8 -0
- package/lib/module/realtime-transcription/index.js.map +1 -0
- package/lib/module/realtime-transcription/types.js +107 -0
- package/lib/module/realtime-transcription/types.js.map +1 -0
- package/lib/module/utils/WavFileReader.js +151 -0
- package/lib/module/utils/WavFileReader.js.map +1 -0
- package/lib/module/utils/WavFileWriter.js +174 -0
- package/lib/module/utils/WavFileWriter.js.map +1 -0
- package/lib/module/utils/common.js +18 -0
- package/lib/module/utils/common.js.map +1 -0
- package/lib/typescript/AudioSessionIos.d.ts +1 -1
- package/lib/typescript/AudioSessionIos.d.ts.map +1 -1
- package/lib/typescript/NativeRNWhisper.d.ts +1 -0
- package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +8 -4
- package/lib/typescript/index.d.ts.map +1 -1
- package/lib/typescript/jest-mock.d.ts +2 -0
- package/lib/typescript/jest-mock.d.ts.map +1 -0
- package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts +166 -0
- package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -0
- package/lib/typescript/realtime-transcription/SliceManager.d.ts +72 -0
- package/lib/typescript/realtime-transcription/SliceManager.d.ts.map +1 -0
- package/lib/typescript/realtime-transcription/adapters/AudioPcmStreamAdapter.d.ts +22 -0
- package/lib/typescript/realtime-transcription/adapters/AudioPcmStreamAdapter.d.ts.map +1 -0
- package/lib/typescript/realtime-transcription/adapters/JestAudioStreamAdapter.d.ts +44 -0
- package/lib/typescript/realtime-transcription/adapters/JestAudioStreamAdapter.d.ts.map +1 -0
- package/lib/typescript/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.d.ts +75 -0
- package/lib/typescript/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.d.ts.map +1 -0
- package/lib/typescript/realtime-transcription/index.d.ts +6 -0
- package/lib/typescript/realtime-transcription/index.d.ts.map +1 -0
- package/lib/typescript/realtime-transcription/types.d.ts +222 -0
- package/lib/typescript/realtime-transcription/types.d.ts.map +1 -0
- package/lib/typescript/utils/WavFileReader.d.ts +61 -0
- package/lib/typescript/utils/WavFileReader.d.ts.map +1 -0
- package/lib/typescript/utils/WavFileWriter.d.ts +57 -0
- package/lib/typescript/utils/WavFileWriter.d.ts.map +1 -0
- package/lib/typescript/utils/common.d.ts +9 -0
- package/lib/typescript/utils/common.d.ts.map +1 -0
- package/package.json +23 -11
- package/src/AudioSessionIos.ts +3 -2
- package/src/NativeRNWhisper.ts +2 -0
- package/src/index.ts +74 -22
- package/{jest/mock.js → src/jest-mock.ts} +2 -2
- package/src/realtime-transcription/RealtimeTranscriber.ts +1015 -0
- package/src/realtime-transcription/SliceManager.ts +252 -0
- package/src/realtime-transcription/adapters/AudioPcmStreamAdapter.ts +143 -0
- package/src/realtime-transcription/adapters/JestAudioStreamAdapter.ts +251 -0
- package/src/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.ts +378 -0
- package/src/realtime-transcription/index.ts +34 -0
- package/src/realtime-transcription/types.ts +283 -0
- package/src/utils/WavFileReader.ts +202 -0
- package/src/utils/WavFileWriter.ts +206 -0
- package/src/utils/common.ts +17 -0
package/README.md
CHANGED

````diff
@@ -11,9 +11,9 @@ React Native binding of [whisper.cpp](https://github.com/ggerganov/whisper.cpp).
 ## Screenshots
 
 | <img src="https://github.com/mybigday/whisper.rn/assets/3001525/2fea7b2d-c911-44fb-9afc-8efc7b594446" width="300" /> | <img src="https://github.com/mybigday/whisper.rn/assets/3001525/a5005a6c-44f7-4db9-95e8-0fd951a2e147" width="300" /> |
-… (3 removed lines not captured in this view)
+| :------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------: |
+| iOS: Tested on iPhone 13 Pro Max | Android: Tested on Pixel 6 |
+| (tiny.en, Core ML enabled, release mode + archive) | (tiny.en, armv8.2-a+fp16, release mode) |
 
 ## Installation
 
@@ -49,7 +49,9 @@ You will need to prebuild the project before using it. See [Expo guide](https://
 If you want to use realtime transcribe, you need to add the microphone permission to your app.
 
 ### iOS
-Add these lines to ```ios/[YOU_APP_NAME]/info.plist```
+
+Add these lines to `ios/[YOU_APP_NAME]/info.plist`
+
 ```xml
 <key>NSMicrophoneUsageDescription</key>
 <string>This app requires microphone access in order to transcribe speech</string>
@@ -58,10 +60,13 @@ Add these lines to ```ios/[YOU_APP_NAME]/info.plist```
 For tvOS, please note that the microphone is not supported.
 
 ### Android
-… (1 removed line not captured in this view)
+
+Add the following line to `android/app/src/main/AndroidManifest.xml`
+
 ```xml
 <uses-permission android:name="android.permission.RECORD_AUDIO" />
 ```
+
 ## Tips & Tricks
 
 The [Tips & Tricks](docs/TIPS.md) document is a collection of tips and tricks for using `whisper.rn`.
@@ -83,24 +88,6 @@ const { result } = await promise
 // result: (The inference text result from audio file)
 ```
 
-Use realtime transcribe:
-
-```js
-const { stop, subscribe } = await whisperContext.transcribeRealtime(options)
-
-subscribe(evt => {
-  const { isCapturing, data, processTime, recordingTime } = evt
-  console.log(
-    `Realtime transcribing: ${isCapturing ? 'ON' : 'OFF'}\n` +
-      // The inference text result from audio record:
-      `Result: ${data.result}\n\n` +
-      `Process time: ${processTime}ms\n` +
-      `Recording time: ${recordingTime}ms`,
-  )
-  if (!isCapturing) console.log('Finished realtime transcribing')
-})
-```
-
 ## Voice Activity Detection (VAD)
 
 Voice Activity Detection allows you to detect speech segments in audio data using the Silero VAD model.
@@ -157,7 +144,11 @@ const segments = await vadContext.detectSpeechData(base64AudioData, {
 
 ```typescript
 segments.forEach((segment, index) => {
-  console.log(
+  console.log(
+    `Segment ${index + 1}: ${segment.t0.toFixed(2)}s - ${segment.t1.toFixed(
+      2,
+    )}s`,
+  )
   console.log(`Duration: ${(segment.t1 - segment.t0).toFixed(2)}s`)
 })
 ```
@@ -170,35 +161,57 @@ await vadContext.release()
 await releaseAllWhisperVad()
 ```
 
-… (1 removed line not captured in this view)
+## Realtime Transcription
+
+The new `RealtimeTranscriber` provides enhanced realtime transcription with features like Voice Activity Detection (VAD), auto-slicing, and memory management.
 
-Option 1 - Use options in transcribeRealtime:
 ```js
-… (1 removed line not captured in this view)
+// If your RN packager is not enable package exports support, use whisper.rn/src/realtime-transcription
+import { RealtimeTranscriber } from 'whisper.rn/realtime-transcription'
+import { AudioPcmStreamAdapter } from 'whisper.rn/realtime-transcription/adapters'
+import RNFS from 'react-native-fs' // or any compatible filesystem
 
-… (6 removed lines not captured; this example is re-added under "Deprecated APIs" below)
-  audioSessionOnStopIos: 'restore', // Or an AudioSessionSettingIos
+// Dependencies
+const whisperContext = await initWhisper({
+  /* ... */
+})
+const vadContext = await initWhisperVad({
+  /* ... */
 })
+const audioStream = new AudioPcmStreamAdapter() // requires @fugood/react-native-audio-pcm-stream
+
+// Create transcriber
+const transcriber = new RealtimeTranscriber(
+  { whisperContext, vadContext, audioStream, fs: RNFS },
+  {
+    audioSliceSec: 30,
+    vadPreset: 'default',
+    autoSliceOnSpeechEnd: true,
+    transcribeOptions: { language: 'en' },
+  },
+  {
+    onTranscribe: (event) => console.log('Transcription:', event.data?.result),
+    onVad: (event) => console.log('VAD:', event.type, event.confidence),
+    onStatusChange: (isActive) =>
+      console.log('Status:', isActive ? 'ACTIVE' : 'INACTIVE'),
+    onError: (error) => console.error('Error:', error),
+  },
+)
+
+// Start/stop transcription
+await transcriber.start()
+await transcriber.stop()
 ```
 
-… (1 removed line not captured in this view)
-```js
-import { AudioSessionIos } from 'whisper.rn'
+**Dependencies:**
 
-… (2 removed lines not captured in this view)
-)
-await AudioSessionIos.setMode(AudioSessionIos.Mode.Default)
-await AudioSessionIos.setActive(true)
-// Then you can start do recording
-```
+- `@fugood/react-native-audio-pcm-stream` for `AudioPcmStreamAdapter`
+- Compatible filesystem module (e.g., `react-native-fs`). See [filesystem interface](src/utils/WavFileWriter.ts#L9-L16) for TypeScript definition
 
-… (1 removed line not captured in this view)
+**Custom Audio Adapters:**
+You can create custom audio stream adapters by implementing the [AudioStreamInterface](src/realtime-transcription/types.ts#L21-L30). This allows integration with different audio sources or custom audio processing pipelines.
+
+**Example:** See [complete example](example/src/RealtimeTranscriber.tsx) for full implementation including file simulation and UI.
 
 Please visit the [Documentation](docs/) for more details.
 
@@ -213,8 +226,10 @@ const whisperContext = await initWhisper({
   filePath: require('../assets/ggml-tiny.en.bin'),
 })
 
-const { stop, promise } =
-… (1 removed line not captured in this view)
+const { stop, promise } = whisperContext.transcribe(
+  require('../assets/sample.wav'),
+  options,
+)
 
 // ...
 ```
@@ -233,18 +248,19 @@ module.exports = {
       ...defaultAssetExts,
       'bin', // whisper.rn: ggml model binary
       'mil', // whisper.rn: CoreML model asset
-    ]
+    ],
   },
 }
 ```
 
 Please note that:
+
 - It will significantly increase the size of the app in release mode.
 - The RN packager is not allowed file size larger than 2GB, so it not able to use original f16 `large` model (2.9GB), you can use quantized models instead.
 
 ## Core ML support
 
-… (1 removed line not captured in this view)
+**_Platform: iOS 15.0+, tvOS 15.0+_**
 
 To use Core ML on iOS, you will need to have the Core ML model files.
 
@@ -301,9 +317,71 @@ Please follow the [Development Workflow section of contributing guide](./CONTRIB
 We have provided a mock version of `whisper.rn` for testing purpose you can use on Jest:
 
 ```js
-jest.mock('whisper.rn', () => require('whisper.rn/jest
+jest.mock('whisper.rn', () => require('whisper.rn/jest-mock'))
+```
+
+## Deprecated APIs
+
+### `transcribeRealtime` (Deprecated)
+
+> ⚠️ **Deprecated**: Use `RealtimeTranscriber` instead for enhanced features and better performance.
+
+```js
+const { stop, subscribe } = await whisperContext.transcribeRealtime(options)
+
+subscribe((evt) => {
+  const { isCapturing, data, processTime, recordingTime } = evt
+  console.log(
+    `Realtime transcribing: ${isCapturing ? 'ON' : 'OFF'}\n` +
+      `Result: ${data.result}\n\n` +
+      `Process time: ${processTime}ms\n` +
+      `Recording time: ${recordingTime}ms`,
+  )
+  if (!isCapturing) console.log('Finished realtime transcribing')
+})
 ```
 
+In iOS, You may need to change the Audio Session so that it can be used with other audio playback, or to optimize the quality of the recording. So we have provided AudioSession utilities for you:
+
+Option 1 - Use options in transcribeRealtime:
+
+```js
+import { AudioSessionIos } from 'whisper.rn'
+
+const { stop, subscribe } = await whisperContext.transcribeRealtime({
+  audioSessionOnStartIos: {
+    category: AudioSessionIos.Category.PlayAndRecord,
+    options: [AudioSessionIos.CategoryOption.MixWithOthers],
+    mode: AudioSessionIos.Mode.Default,
+  },
+  audioSessionOnStopIos: 'restore', // Or an AudioSessionSettingIos
+})
+```
+
+Option 2 - Manage the Audio Session in anywhere:
+
+```js
+import { AudioSessionIos } from 'whisper.rn'
+
+await AudioSessionIos.setCategory(AudioSessionIos.Category.PlayAndRecord, [
+  AudioSessionIos.CategoryOption.MixWithOthers,
+])
+await AudioSessionIos.setMode(AudioSessionIos.Mode.Default)
+await AudioSessionIos.setActive(true)
+// Then you can start do recording
+```
+
+In Android, you may need to request the microphone permission by [`PermissionAndroid`](https://reactnative.dev/docs/permissionsandroid).
+
+## Apps using `whisper.rn`
+
+- [BRICKS](https://bricks.tools): Our product for building interactive signage in simple way. We provide LLM functions as Generator LLM/Assistant.
+- ... (Any Contribution is welcome)
+
+## Node.js binding
+
+- [whisper.node](https://github.com/mybigday/whisper.node): An another Node.js binding of `whisper.cpp` but made API same as `whisper.rn`.
+
 ## Contributing
 
 See the [contributing guide](CONTRIBUTING.md) to learn how to contribute to the repository and the development workflow.
````
package/android/build.gradle
CHANGED

(one-line addition; hunk not captured in this view)

package/android/src/main/CMakeLists.txt
CHANGED

```diff
@@ -110,6 +110,7 @@ build_library("rnwhisper" "generic" "")
 
 if (${ANDROID_ABI} STREQUAL "arm64-v8a")
     build_library("rnwhisper_v8fp16_va_2" "arm" "-march=armv8.2-a+fp16")
+    build_library("rnwhisper_v8" "arm" "-march=armv8-a")
 elseif (${ANDROID_ABI} STREQUAL "armeabi-v7a")
     build_library("rnwhisper_vfpv4" "arm" "-mfpu=neon-vfpv4")
 elseif (${ANDROID_ABI} STREQUAL "x86_64")
```
package/android/src/main/java/com/rnwhisper/RNWhisper.java
CHANGED

```diff
@@ -64,6 +64,10 @@ public class RNWhisper implements LifecycleEventListener {
   }
 
   public void installJSIBindings(Promise promise) {
+    if (!WhisperContext.isNativeLibraryLoaded()) {
+      promise.reject("Native library not loaded");
+      return;
+    }
 
     AsyncTask task = new AsyncTask<Void, Void, Void>() {
       private Exception exception;
@@ -95,6 +99,37 @@ public class RNWhisper implements LifecycleEventListener {
     tasks.put(task, "installJSIBindings");
   }
 
+  public void toggleNativeLog(boolean enabled, Promise promise) {
+    if (!WhisperContext.isNativeLibraryLoaded()) {
+      promise.reject("Native library not loaded");
+      return;
+    }
+
+    new AsyncTask<Void, Void, Boolean>() {
+      private Exception exception;
+
+      @Override
+      protected Boolean doInBackground(Void... voids) {
+        try {
+          WhisperContext.toggleNativeLog(reactContext, enabled);
+          return true;
+        } catch (Exception e) {
+          exception = e;
+        }
+        return null;
+      }
+
+      @Override
+      protected void onPostExecute(Boolean result) {
+        if (exception != null) {
+          promise.reject(exception);
+          return;
+        }
+        promise.resolve(result);
+      }
+    }.executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR);
+  }
+
   private int getResourceIdentifier(String filePath) {
     int identifier = reactContext.getResources().getIdentifier(
       filePath,
```
package/android/src/main/java/com/rnwhisper/WhisperContext.java
CHANGED

```diff
@@ -29,6 +29,29 @@ public class WhisperContext {
 
   private static String loadedLibrary = "";
 
+  private static class NativeLogCallback {
+    DeviceEventManagerModule.RCTDeviceEventEmitter eventEmitter;
+
+    public NativeLogCallback(ReactApplicationContext reactContext) {
+      this.eventEmitter = reactContext.getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter.class);
+    }
+
+    void emitNativeLog(String level, String text) {
+      WritableMap event = Arguments.createMap();
+      event.putString("level", level);
+      event.putString("text", text);
+      eventEmitter.emit("@RNWhisper_onNativeLog", event);
+    }
+  }
+
+  static void toggleNativeLog(ReactApplicationContext reactContext, boolean enabled) {
+    if (enabled) {
+      setupLog(new NativeLogCallback(reactContext));
+    } else {
+      unsetLog();
+    }
+  }
+
   private static final int SAMPLE_RATE = 16000;
   private static final int CHANNEL_CONFIG = AudioFormat.CHANNEL_IN_MONO;
   private static final int AUDIO_FORMAT = AudioFormat.ENCODING_PCM_16BIT;
@@ -454,6 +477,10 @@ public class WhisperContext {
       Log.d(NAME, "Loading librnwhisper_v8fp16_va_2.so");
       System.loadLibrary("rnwhisper_v8fp16_va_2");
       loadedLibrary = "rnwhisper_v8fp16_va_2";
+    } else {
+      Log.d(NAME, "Loading librnwhisper_v8.so");
+      System.loadLibrary("rnwhisper_v8");
+      loadedLibrary = "rnwhisper_v8";
     }
   } else if (WhisperContext.isArmeabiV7a()) {
     Log.d(NAME, "Loading librnwhisper_vfpv4.so");
@@ -468,6 +495,10 @@ public class WhisperContext {
     }
   }
 
+  public static boolean isNativeLibraryLoaded() {
+    return loadedLibrary != "";
+  }
+
   public static boolean isArm64V8a() {
     return Build.SUPPORTED_ABIS[0].equals("arm64-v8a");
   }
@@ -571,4 +602,6 @@ public class WhisperContext {
   // JSI Installation
   protected static native void installJSIBindings(long runtimePtr, Object callInvokerHolder);
   protected static native void cleanupJSIBindings();
+  protected static native void setupLog(NativeLogCallback logCallback);
+  protected static native void unsetLog();
 }
```
package/android/src/main/jni.cpp
CHANGED

```diff
@@ -23,6 +23,64 @@
 #define LOGI(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__)
 #define LOGW(...) __android_log_print(ANDROID_LOG_WARN, TAG, __VA_ARGS__)
 
+struct log_callback_context {
+    JavaVM *jvm;
+    jobject callback;
+};
+
+static void rnwhisper_log_callback_default(enum wsp_ggml_log_level level, const char * fmt, void * data) {
+    if (level == WSP_GGML_LOG_LEVEL_ERROR) __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data);
+    else if (level == WSP_GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data);
+    else if (level == WSP_GGML_LOG_LEVEL_WARN) __android_log_print(ANDROID_LOG_WARN, TAG, fmt, data);
+    else __android_log_print(ANDROID_LOG_DEFAULT, TAG, fmt, data);
+}
+
+static void rnwhisper_log_callback_to_j(enum wsp_ggml_log_level level, const char * text, void * data) {
+    const char* level_c = "";
+    if (level == WSP_GGML_LOG_LEVEL_ERROR) {
+        __android_log_print(ANDROID_LOG_ERROR, TAG, text, nullptr);
+        level_c = "error";
+    } else if (level == WSP_GGML_LOG_LEVEL_INFO) {
+        __android_log_print(ANDROID_LOG_INFO, TAG, text, nullptr);
+        level_c = "info";
+    } else if (level == WSP_GGML_LOG_LEVEL_WARN) {
+        __android_log_print(ANDROID_LOG_WARN, TAG, text, nullptr);
+        level_c = "warn";
+    } else {
+        __android_log_print(ANDROID_LOG_DEFAULT, TAG, text, nullptr);
+    }
+
+    log_callback_context *cb_ctx = (log_callback_context *) data;
+
+    JNIEnv *env;
+    bool need_detach = false;
+    int getEnvResult = cb_ctx->jvm->GetEnv((void**)&env, JNI_VERSION_1_6);
+
+    if (getEnvResult == JNI_EDETACHED) {
+        if (cb_ctx->jvm->AttachCurrentThread(&env, nullptr) == JNI_OK) {
+            need_detach = true;
+        } else {
+            return;
+        }
+    } else if (getEnvResult != JNI_OK) {
+        return;
+    }
+
+    jobject callback = cb_ctx->callback;
+    jclass cb_class = env->GetObjectClass(callback);
+    jmethodID emitNativeLog = env->GetMethodID(cb_class, "emitNativeLog", "(Ljava/lang/String;Ljava/lang/String;)V");
+
+    jstring level_str = env->NewStringUTF(level_c);
+    jstring text_str = env->NewStringUTF(text);
+    env->CallVoidMethod(callback, emitNativeLog, level_str, text_str);
+    env->DeleteLocalRef(level_str);
+    env->DeleteLocalRef(text_str);
+
+    if (need_detach) {
+        cb_ctx->jvm->DetachCurrentThread();
+    }
+}
+
 static inline int min(int a, int b) {
     return (a < b) ? a : b;
 }
@@ -800,7 +858,30 @@ Java_com_rnwhisper_WhisperContext_cleanupJSIBindings(
     JNIEnv *env,
     jclass clazz
 ) {
+    UNUSED(env);
+    UNUSED(clazz);
     rnwhisper_jsi::cleanupJSIBindings();
 }
 
+JNIEXPORT void JNICALL
+Java_com_rnwhisper_WhisperContext_setupLog(JNIEnv *env, jobject thiz, jobject logCallback) {
+    UNUSED(thiz);
+
+    log_callback_context *cb_ctx = new log_callback_context;
+
+    JavaVM *jvm;
+    env->GetJavaVM(&jvm);
+    cb_ctx->jvm = jvm;
+    cb_ctx->callback = env->NewGlobalRef(logCallback);
+
+    whisper_log_set(rnwhisper_log_callback_to_j, cb_ctx);
+}
+
+JNIEXPORT void JNICALL
+Java_com_rnwhisper_WhisperContext_unsetLog(JNIEnv *env, jobject thiz) {
+    UNUSED(env);
+    UNUSED(thiz);
+    whisper_log_set(rnwhisper_log_callback_default, NULL);
+}
+
 } // extern "C"
```
package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java
CHANGED

```diff
@@ -31,6 +31,11 @@ public class RNWhisperModule extends NativeRNWhisperSpec {
     rnwhisper.installJSIBindings(promise);
   }
 
+  @ReactMethod
+  public void toggleNativeLog(boolean enabled, Promise promise) {
+    rnwhisper.toggleNativeLog(enabled, promise);
+  }
+
   @Override
   @NonNull
   public String getName() {
```

package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java
CHANGED

```diff
@@ -31,6 +31,11 @@ public class RNWhisperModule extends ReactContextBaseJavaModule {
     rnwhisper.installJSIBindings(promise);
   }
 
+  @ReactMethod
+  public void toggleNativeLog(boolean enabled, Promise promise) {
+    rnwhisper.toggleNativeLog(enabled, promise);
+  }
+
   @Override
   @NonNull
   public String getName() {
```
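Both module flavors forward the new `toggleNativeLog` method to the shared `RNWhisper` implementation, which (via `WhisperContext.NativeLogCallback` above) relays whisper.cpp log lines to JS as `@RNWhisper_onNativeLog` events with a `{ level, text }` payload. A minimal consumer sketch, assuming the module is reachable as `NativeModules.RNWhisper`; the package may also expose a higher-level wrapper in `src/index.ts`, which this section does not show.

```typescript
import { NativeEventEmitter, NativeModules } from 'react-native'

// Event name and payload shape are taken verbatim from the diff above.
export async function enableWhisperNativeLogs(): Promise<() => Promise<void>> {
  const emitter = new NativeEventEmitter(NativeModules.RNWhisper)
  const sub = emitter.addListener(
    '@RNWhisper_onNativeLog',
    ({ level, text }: { level: string; text: string }) => {
      console.log(`[whisper:${level}] ${text}`)
    },
  )
  await NativeModules.RNWhisper.toggleNativeLog(true)

  // Cleanup function: disable native logging and unsubscribe.
  return async () => {
    await NativeModules.RNWhisper.toggleNativeLog(false)
    sub.remove()
  }
}
```

Note that on Android `toggleNativeLog` resolves a promise once the background task completes, while the iOS export (`RCT_EXPORT_METHOD(toggleNativeLog:(BOOL)enabled)`, below) takes no promise, so the `await` there is a no-op.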
package/cpp/jsi/RNWhisperJSI.cpp
CHANGED

```diff
@@ -295,6 +295,38 @@ CallbackInfo extractCallbacks(Runtime& runtime, const Object& optionsObj) {
     return info;
 }
 
+// Helper function to extract VAD parameters from options
+whisper_vad_params extractVadParams(Runtime& runtime, const Object& optionsObj) {
+    whisper_vad_params vadParams = whisper_vad_default_params();
+
+    try {
+        auto propNames = optionsObj.getPropertyNames(runtime);
+        for (size_t i = 0; i < propNames.size(runtime); i++) {
+            auto propNameValue = propNames.getValueAtIndex(runtime, i);
+            std::string propName = propNameValue.getString(runtime).utf8(runtime);
+            Value propValue = optionsObj.getProperty(runtime, propNameValue.getString(runtime));
+
+            if (propName == "threshold" && propValue.isNumber()) {
+                vadParams.threshold = (float)propValue.getNumber();
+            } else if (propName == "minSpeechDurationMs" && propValue.isNumber()) {
+                vadParams.min_speech_duration_ms = (int)propValue.getNumber();
+            } else if (propName == "minSilenceDurationMs" && propValue.isNumber()) {
+                vadParams.min_silence_duration_ms = (int)propValue.getNumber();
+            } else if (propName == "maxSpeechDurationS" && propValue.isNumber()) {
+                vadParams.max_speech_duration_s = (float)propValue.getNumber();
+            } else if (propName == "speechPadMs" && propValue.isNumber()) {
+                vadParams.speech_pad_ms = (int)propValue.getNumber();
+            } else if (propName == "samplesOverlap" && propValue.isNumber()) {
+                vadParams.samples_overlap = (float)propValue.getNumber();
+            }
+        }
+    } catch (...) {
+        // Ignore parameter extraction errors
+    }
+
+    return vadParams;
+}
+
 // Helper function to create segments array
 Array createSegmentsArray(Runtime& runtime, struct whisper_context* ctx, int offset) {
     int n_segments = whisper_full_n_segments(ctx);
@@ -355,10 +387,13 @@ Value createPromiseTask(
 
     whisper_full_params params = {};
     CallbackInfo callbackInfo = {};
+    whisper_vad_params vadParams = {};
     if (functionName == "whisperTranscribeData") {
         params = createFullParamsFromJSI(runtime, optionsObj);
         // Extract data from optionsObj before lambda capture
        callbackInfo = extractCallbacks(runtime, optionsObj);
+    } else if (functionName == "whisperVadDetectSpeech") {
+        vadParams = extractVadParams(runtime, optionsObj);
     }
 
     // Create promise
@@ -368,7 +403,7 @@ Value createPromiseTask(
         runtime,
         PropNameID::forAscii(runtime, ""),
         2, // resolve, reject
-        [contextId, audioResult, params, callbackInfo, task, callInvoker, functionName](Runtime& runtime, const Value& thisValue, const Value* arguments, size_t count) -> Value {
+        [contextId, audioResult, params, callbackInfo, vadParams, task, callInvoker, functionName](Runtime& runtime, const Value& thisValue, const Value* arguments, size_t count) -> Value {
             if (count != 2) {
                 throw JSError(runtime, "Promise executor expects 2 arguments (resolve, reject)");
             }
@@ -379,10 +414,10 @@ Value createPromiseTask(
 
             // Execute task in ThreadPool
             auto future = getWhisperThreadPool().enqueue([
-                contextId, audioResult, params, callbackInfo, task, resolvePtr, rejectPtr, callInvoker, safeRuntime, functionName]() {
+                contextId, audioResult, params, callbackInfo, vadParams, task, resolvePtr, rejectPtr, callInvoker, safeRuntime, functionName]() {
 
                 try {
-                    task(contextId, audioResult, params, callbackInfo, resolvePtr, rejectPtr, callInvoker, safeRuntime);
+                    task(contextId, audioResult, params, callbackInfo, vadParams, resolvePtr, rejectPtr, callInvoker, safeRuntime);
                 } catch (...) {
                     callInvoker->invokeAsync([rejectPtr, safeRuntime, functionName]() {
                         auto& runtime = *safeRuntime;
@@ -413,7 +448,7 @@ void installJSIBindings(
     try {
         return createPromiseTask<whisper_context>(
             runtime, "whisperTranscribeData", callInvoker, arguments, count,
-            [](int contextId, const AudioData& audioResult, const whisper_full_params& params, const CallbackInfo& callbackInfo,
+            [](int contextId, const AudioData& audioResult, const whisper_full_params& params, const CallbackInfo& callbackInfo, const whisper_vad_params& vadParams,
                std::shared_ptr<Function> resolvePtr, std::shared_ptr<Function> rejectPtr,
                std::shared_ptr<facebook::react::CallInvoker> callInvoker,
                std::shared_ptr<Runtime> safeRuntime) {
@@ -566,7 +601,7 @@ void installJSIBindings(
     try {
         return createPromiseTask<whisper_vad_context>(
             runtime, "whisperVadDetectSpeech", callInvoker, arguments, count,
-            [](int contextId, const AudioData& audioResult, const whisper_full_params& params, const CallbackInfo& callbackInfo,
+            [](int contextId, const AudioData& audioResult, const whisper_full_params& params, const CallbackInfo& callbackInfo, const whisper_vad_params& vadParams,
               std::shared_ptr<Function> resolvePtr, std::shared_ptr<Function> rejectPtr,
               std::shared_ptr<facebook::react::CallInvoker> callInvoker,
               std::shared_ptr<Runtime> safeRuntime) {
@@ -600,7 +635,8 @@ void installJSIBindings(
                 bool isSpeech = whisper_vad_detect_speech(vadContext, audioResult.data.data(), audioResult.count);
                 logInfo("VAD detection result: %s", isSpeech ? "speech" : "no speech");
 
-                struct whisper_vad_params vad_params =
+                struct whisper_vad_params vad_params = vadParams;
+
                 struct whisper_vad_segments* segments = nullptr;
                 if (isSpeech) {
                     segments = whisper_vad_segments_from_probs(vadContext, vad_params);
```
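`extractVadParams` maps camelCase option keys from JS onto whisper.cpp's `whisper_vad_params`, starting from `whisper_vad_default_params()` so omitted keys keep their defaults, and silently skipping non-numeric or unknown keys. A sketch of the matching JS options object follows; the key names are verbatim from the hunk above, while the values are only examples and the call shape follows the README's `vadContext.detectSpeechData(base64AudioData, { ... })` snippet.

```typescript
// Keys mirror extractVadParams above; values are illustrative, not the defaults.
const vadOptions = {
  threshold: 0.5, // -> vadParams.threshold (float)
  minSpeechDurationMs: 250, // -> min_speech_duration_ms (int)
  minSilenceDurationMs: 100, // -> min_silence_duration_ms (int)
  maxSpeechDurationS: 30, // -> max_speech_duration_s (float)
  speechPadMs: 30, // -> speech_pad_ms (int)
  samplesOverlap: 0.1, // -> samples_overlap (float)
}

// As in the README VAD example; non-numeric or unknown keys are ignored natively.
const segments = await vadContext.detectSpeechData(base64AudioData, vadOptions)
```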
package/ios/RNWhisper.mm
CHANGED

```diff
@@ -24,6 +24,16 @@ RCT_EXPORT_MODULE()
     return NO;
 }
 
+RCT_EXPORT_METHOD(toggleNativeLog:(BOOL)enabled) {
+    void (^onEmitLog)(NSString *level, NSString *text) = nil;
+    if (enabled) {
+        onEmitLog = ^(NSString *level, NSString *text) {
+            [self sendEventWithName:@"@RNWhisper_onNativeLog" body:@{ @"level": level, @"text": text }];
+        };
+    }
+    [RNWhisperContext toggleNativeLog:enabled onEmitLog:onEmitLog];
+}
+
 - (NSDictionary *)constantsToExport
 {
     return @{
@@ -107,6 +117,7 @@ RCT_REMAP_METHOD(initContext,
         @"@RNWhisper_onTranscribeNewSegments",
         @"@RNWhisper_onRealtimeTranscribe",
         @"@RNWhisper_onRealtimeTranscribeEnd",
+        @"@RNWhisper_onNativeLog",
     ];
 }
 
```
package/ios/RNWhisperContext.h
CHANGED

```diff
@@ -47,6 +47,7 @@ typedef struct {
     bool isMetalEnabled;
 }
 
++ (void)toggleNativeLog:(BOOL)enabled onEmitLog:(void (^)(NSString *level, NSString *text))onEmitLog;
 + (instancetype)initWithModelPath:(NSString *)modelPath contextId:(int)contextId noCoreML:(BOOL)noCoreML noMetal:(BOOL)noMetal useFlashAttn:(BOOL)useFlashAttn;
 - (bool)isMetalEnabled;
 - (NSString *)reasonNoMetal;
```