whisper.rn 0.4.0-rc.4 → 0.4.0-rc.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -6
- package/android/build.gradle +4 -0
- package/android/src/main/CMakeLists.txt +5 -0
- package/android/src/main/java/com/rnwhisper/AudioUtils.java +0 -80
- package/android/src/main/java/com/rnwhisper/WhisperContext.java +57 -134
- package/android/src/main/jni-utils.h +76 -0
- package/android/src/main/jni.cpp +188 -112
- package/cpp/README.md +1 -1
- package/cpp/coreml/whisper-encoder-impl.h +1 -1
- package/cpp/coreml/whisper-encoder.h +4 -0
- package/cpp/coreml/whisper-encoder.mm +4 -2
- package/cpp/ggml-alloc.c +55 -19
- package/cpp/ggml-alloc.h +8 -1
- package/cpp/ggml-backend-impl.h +46 -21
- package/cpp/ggml-backend.c +563 -156
- package/cpp/ggml-backend.h +62 -17
- package/cpp/ggml-impl.h +1 -1
- package/cpp/ggml-metal-whisper.metal +2444 -359
- package/cpp/ggml-metal.h +7 -1
- package/cpp/ggml-metal.m +1105 -197
- package/cpp/ggml-quants.c +66 -61
- package/cpp/ggml-quants.h +40 -40
- package/cpp/ggml.c +1040 -1590
- package/cpp/ggml.h +109 -30
- package/cpp/rn-audioutils.cpp +68 -0
- package/cpp/rn-audioutils.h +14 -0
- package/cpp/rn-whisper-log.h +11 -0
- package/cpp/rn-whisper.cpp +143 -59
- package/cpp/rn-whisper.h +48 -15
- package/cpp/whisper.cpp +1635 -928
- package/cpp/whisper.h +55 -10
- package/ios/RNWhisper.mm +7 -7
- package/ios/RNWhisperAudioUtils.h +0 -2
- package/ios/RNWhisperAudioUtils.m +0 -56
- package/ios/RNWhisperContext.h +3 -11
- package/ios/RNWhisperContext.mm +68 -137
- package/lib/commonjs/index.js.map +1 -1
- package/lib/commonjs/version.json +1 -1
- package/lib/module/index.js.map +1 -1
- package/lib/module/version.json +1 -1
- package/lib/typescript/index.d.ts +5 -0
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +6 -5
- package/src/index.ts +5 -0
- package/src/version.json +1 -1
- package/ios/RNWhisper.xcodeproj/project.xcworkspace/contents.xcworkspacedata +0 -4
- package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +0 -8
- package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcuserdata/jhen.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
- package/ios/RNWhisper.xcodeproj/xcuserdata/jhen.xcuserdatad/xcschemes/xcschememanagement.plist +0 -19
package/README.md
CHANGED
|
@@ -25,19 +25,19 @@ npm install whisper.rn
|
|
|
25
25
|
|
|
26
26
|
Please run `npx pod-install` again.
|
|
27
27
|
|
|
28
|
-
#### Android
|
|
29
|
-
|
|
30
28
|
If you want to use the `medium` or `large` model, it is recommended to enable the [Extended Virtual Addressing](https://developer.apple.com/documentation/bundleresources/entitlements/com_apple_developer_kernel_extended-virtual-addressing) capability in the iOS project.
|
|
31
29
|
|
|
32
|
-
|
|
30
|
+
#### Android
|
|
33
31
|
|
|
34
|
-
|
|
32
|
+
Add the following ProGuard rule if ProGuard is enabled in your project (android/app/proguard-rules.pro):
|
|
35
33
|
|
|
36
34
|
```proguard
|
|
37
35
|
# whisper.rn
|
|
38
36
|
-keep class com.rnwhisper.** { *; }
|
|
39
37
|
```
|
|
40
38
|
|
|
39
|
+
When building on Apple Silicon Macs, it's recommended to use `ndkVersion = "24.0.8215888"` (or above) in your root project build configuration. Otherwise, please follow this troubleshooting [issue](./TROUBLESHOOTING.md#android-got-build-error-unknown-host-cpu-architecture-arm64-on-apple-silicon-macs).
|
|
40
|
+
|
|
41
41
|
#### Expo
|
|
42
42
|
|
|
43
43
|
You will need to prebuild the project before using it. See [Expo guide](https://docs.expo.io/guides/using-libraries/#using-a-library-in-a-expo-project) for more details.
|
|
@@ -91,7 +91,7 @@ subscribe(evt => {
|
|
|
91
91
|
console.log(
|
|
92
92
|
`Realtime transcribing: ${isCapturing ? 'ON' : 'OFF'}\n` +
|
|
93
93
|
// The inference text result from audio record:
|
|
94
|
-
`Result: ${data.result}\n\n` +
|
|
94
|
+
`Result: ${data.result}\n\n` +
|
|
95
95
|
`Process time: ${processTime}ms\n` +
|
|
96
96
|
`Recording time: ${recordingTime}ms`,
|
|
97
97
|
)
|
|
@@ -220,7 +220,7 @@ In real world, we recommended to split the asset imports into another platform s
|
|
|
220
220
|
|
|
221
221
|
The example app provides a simple UI for testing the functions.
|
|
222
222
|
|
|
223
|
-
Used Whisper model: `tiny.en` in https://huggingface.co/ggerganov/whisper.cpp
|
|
223
|
+
Used Whisper model: `tiny.en` in https://huggingface.co/ggerganov/whisper.cpp
|
|
224
224
|
Sample file: `jfk.wav` in https://github.com/ggerganov/whisper.cpp/tree/master/samples
|
|
225
225
|
|
|
226
226
|
Please follow the [Development Workflow section of contributing guide](./CONTRIBUTING.md#development-workflow) to run the example app.
|
package/android/build.gradle
CHANGED
|
@@ -36,6 +36,10 @@ def reactNativeArchitectures() {
|
|
|
36
36
|
}
|
|
37
37
|
|
|
38
38
|
android {
|
|
39
|
+
def agpVersion = com.android.Version.ANDROID_GRADLE_PLUGIN_VERSION
|
|
40
|
+
if (agpVersion.tokenize('.')[0].toInteger() >= 7) {
|
|
41
|
+
namespace "com.rnwhisper"
|
|
42
|
+
}
|
|
39
43
|
ndkVersion getExtOrDefault("ndkVersion")
|
|
40
44
|
compileSdkVersion getExtOrIntegerDefault("compileSdkVersion")
|
|
41
45
|
|
|
@@ -12,6 +12,7 @@ set(
|
|
|
12
12
|
${RNWHISPER_LIB_DIR}/ggml-backend.c
|
|
13
13
|
${RNWHISPER_LIB_DIR}/ggml-quants.c
|
|
14
14
|
${RNWHISPER_LIB_DIR}/whisper.cpp
|
|
15
|
+
${RNWHISPER_LIB_DIR}/rn-audioutils.cpp
|
|
15
16
|
${RNWHISPER_LIB_DIR}/rn-whisper.cpp
|
|
16
17
|
${CMAKE_SOURCE_DIR}/jni.cpp
|
|
17
18
|
)
|
|
@@ -33,6 +34,10 @@ function(build_library target_name)
|
|
|
33
34
|
target_compile_options(${target_name} PRIVATE -mfpu=neon-vfpv4)
|
|
34
35
|
endif ()
|
|
35
36
|
|
|
37
|
+
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
|
|
38
|
+
target_compile_options(${target_name} PRIVATE -DRNWHISPER_ANDROID_ENABLE_LOGGING)
|
|
39
|
+
endif ()
|
|
40
|
+
|
|
36
41
|
# NOTE: If you want to debug the native code, you can uncomment if and endif
|
|
37
42
|
# if (NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug")
|
|
38
43
|
|
|
@@ -2,14 +2,10 @@ package com.rnwhisper;
|
|
|
2
2
|
|
|
3
3
|
import android.util.Log;
|
|
4
4
|
|
|
5
|
-
import java.util.ArrayList;
|
|
6
|
-
import java.lang.StringBuilder;
|
|
7
5
|
import java.io.IOException;
|
|
8
6
|
import java.io.FileReader;
|
|
9
7
|
import java.io.ByteArrayOutputStream;
|
|
10
8
|
import java.io.File;
|
|
11
|
-
import java.io.FileOutputStream;
|
|
12
|
-
import java.io.DataOutputStream;
|
|
13
9
|
import java.io.IOException;
|
|
14
10
|
import java.io.InputStream;
|
|
15
11
|
import java.nio.ByteBuffer;
|
|
@@ -19,82 +15,6 @@ import java.nio.ShortBuffer;
|
|
|
19
15
|
public class AudioUtils {
|
|
20
16
|
private static final String NAME = "RNWhisperAudioUtils";
|
|
21
17
|
|
|
22
|
-
private static final int SAMPLE_RATE = 16000;
|
|
23
|
-
|
|
24
|
-
private static byte[] shortToByte(short[] shortInts) {
|
|
25
|
-
int j = 0;
|
|
26
|
-
int length = shortInts.length;
|
|
27
|
-
byte[] byteData = new byte[length * 2];
|
|
28
|
-
for (int i = 0; i < length; i++) {
|
|
29
|
-
byteData[j++] = (byte) (shortInts[i] >>> 8);
|
|
30
|
-
byteData[j++] = (byte) (shortInts[i] >>> 0);
|
|
31
|
-
}
|
|
32
|
-
return byteData;
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
public static byte[] concatShortBuffers(ArrayList<short[]> buffers) {
|
|
36
|
-
int totalLength = 0;
|
|
37
|
-
for (int i = 0; i < buffers.size(); i++) {
|
|
38
|
-
totalLength += buffers.get(i).length;
|
|
39
|
-
}
|
|
40
|
-
byte[] result = new byte[totalLength * 2];
|
|
41
|
-
int offset = 0;
|
|
42
|
-
for (int i = 0; i < buffers.size(); i++) {
|
|
43
|
-
byte[] bytes = shortToByte(buffers.get(i));
|
|
44
|
-
System.arraycopy(bytes, 0, result, offset, bytes.length);
|
|
45
|
-
offset += bytes.length;
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
return result;
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
private static byte[] removeTrailingZeros(byte[] audioData) {
|
|
52
|
-
int i = audioData.length - 1;
|
|
53
|
-
while (i >= 0 && audioData[i] == 0) {
|
|
54
|
-
--i;
|
|
55
|
-
}
|
|
56
|
-
byte[] newData = new byte[i + 1];
|
|
57
|
-
System.arraycopy(audioData, 0, newData, 0, i + 1);
|
|
58
|
-
return newData;
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
public static void saveWavFile(byte[] rawData, String audioOutputFile) throws IOException {
|
|
62
|
-
Log.d(NAME, "call saveWavFile");
|
|
63
|
-
rawData = removeTrailingZeros(rawData);
|
|
64
|
-
DataOutputStream output = null;
|
|
65
|
-
try {
|
|
66
|
-
output = new DataOutputStream(new FileOutputStream(audioOutputFile));
|
|
67
|
-
// WAVE header
|
|
68
|
-
// see http://ccrma.stanford.edu/courses/422/projects/WaveFormat/
|
|
69
|
-
output.writeBytes("RIFF"); // chunk id
|
|
70
|
-
output.writeInt(Integer.reverseBytes(36 + rawData.length)); // chunk size
|
|
71
|
-
output.writeBytes("WAVE"); // format
|
|
72
|
-
output.writeBytes("fmt "); // subchunk 1 id
|
|
73
|
-
output.writeInt(Integer.reverseBytes(16)); // subchunk 1 size
|
|
74
|
-
output.writeShort(Short.reverseBytes((short) 1)); // audio format (1 = PCM)
|
|
75
|
-
output.writeShort(Short.reverseBytes((short) 1)); // number of channels
|
|
76
|
-
output.writeInt(Integer.reverseBytes(SAMPLE_RATE)); // sample rate
|
|
77
|
-
output.writeInt(Integer.reverseBytes(SAMPLE_RATE * 2)); // byte rate
|
|
78
|
-
output.writeShort(Short.reverseBytes((short) 2)); // block align
|
|
79
|
-
output.writeShort(Short.reverseBytes((short) 16)); // bits per sample
|
|
80
|
-
output.writeBytes("data"); // subchunk 2 id
|
|
81
|
-
output.writeInt(Integer.reverseBytes(rawData.length)); // subchunk 2 size
|
|
82
|
-
// Audio data (conversion big endian -> little endian)
|
|
83
|
-
short[] shorts = new short[rawData.length / 2];
|
|
84
|
-
ByteBuffer.wrap(rawData).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(shorts);
|
|
85
|
-
ByteBuffer bytes = ByteBuffer.allocate(shorts.length * 2);
|
|
86
|
-
for (short s : shorts) {
|
|
87
|
-
bytes.putShort(s);
|
|
88
|
-
}
|
|
89
|
-
Log.d(NAME, "writing audio file: " + audioOutputFile);
|
|
90
|
-
output.write(bytes.array());
|
|
91
|
-
} finally {
|
|
92
|
-
if (output != null) {
|
|
93
|
-
output.close();
|
|
94
|
-
}
|
|
95
|
-
}
|
|
96
|
-
}
|
|
97
|
-
|
|
98
18
|
public static float[] decodeWaveFile(InputStream inputStream) throws IOException {
|
|
99
19
|
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
|
100
20
|
byte[] buffer = new byte[1024];
|
|
@@ -42,7 +42,6 @@ public class WhisperContext {
|
|
|
42
42
|
private AudioRecord recorder = null;
|
|
43
43
|
private int bufferSize;
|
|
44
44
|
private int nSamplesTranscribing = 0;
|
|
45
|
-
private ArrayList<short[]> shortBufferSlices;
|
|
46
45
|
// Remember number of samples in each slice
|
|
47
46
|
private ArrayList<Integer> sliceNSamples;
|
|
48
47
|
// Current buffer slice index
|
|
@@ -66,7 +65,6 @@ public class WhisperContext {
|
|
|
66
65
|
}
|
|
67
66
|
|
|
68
67
|
private void rewind() {
|
|
69
|
-
shortBufferSlices = null;
|
|
70
68
|
sliceNSamples = null;
|
|
71
69
|
sliceIndex = 0;
|
|
72
70
|
transcribeSliceIndex = 0;
|
|
@@ -79,41 +77,14 @@ public class WhisperContext {
|
|
|
79
77
|
fullHandler = null;
|
|
80
78
|
}
|
|
81
79
|
|
|
82
|
-
private boolean vad(
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
int vadMs = options.hasKey("vadMs") ? options.getInt("vadMs") : 2000;
|
|
86
|
-
if (vadMs < 2000) vadMs = 2000;
|
|
87
|
-
int sampleSize = (int) (SAMPLE_RATE * vadMs / 1000);
|
|
88
|
-
if (nSamples + n > sampleSize) {
|
|
89
|
-
int start = nSamples + n - sampleSize;
|
|
90
|
-
float[] audioData = new float[sampleSize];
|
|
91
|
-
for (int i = 0; i < sampleSize; i++) {
|
|
92
|
-
audioData[i] = shortBuffer[i + start] / 32768.0f;
|
|
93
|
-
}
|
|
94
|
-
float vadThold = options.hasKey("vadThold") ? (float) options.getDouble("vadThold") : 0.6f;
|
|
95
|
-
float vadFreqThold = options.hasKey("vadFreqThold") ? (float) options.getDouble("vadFreqThold") : 0.6f;
|
|
96
|
-
isSpeech = vadSimple(audioData, sampleSize, vadThold, vadFreqThold);
|
|
97
|
-
} else {
|
|
98
|
-
isSpeech = false;
|
|
99
|
-
}
|
|
100
|
-
}
|
|
101
|
-
return isSpeech;
|
|
80
|
+
private boolean vad(int sliceIndex, int nSamples, int n) {
|
|
81
|
+
if (isTranscribing) return true;
|
|
82
|
+
return vadSimple(jobId, sliceIndex, nSamples, n);
|
|
102
83
|
}
|
|
103
84
|
|
|
104
|
-
private void finishRealtimeTranscribe(
|
|
105
|
-
String audioOutputPath = options.hasKey("audioOutputPath") ? options.getString("audioOutputPath") : null;
|
|
106
|
-
if (audioOutputPath != null) {
|
|
107
|
-
// TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
|
|
108
|
-
Log.d(NAME, "Begin saving wav file to " + audioOutputPath);
|
|
109
|
-
try {
|
|
110
|
-
AudioUtils.saveWavFile(AudioUtils.concatShortBuffers(shortBufferSlices), audioOutputPath);
|
|
111
|
-
} catch (IOException e) {
|
|
112
|
-
Log.e(NAME, "Error saving wav file: " + e.getMessage());
|
|
113
|
-
}
|
|
114
|
-
}
|
|
115
|
-
|
|
85
|
+
private void finishRealtimeTranscribe(WritableMap result) {
|
|
116
86
|
emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", Arguments.createMap());
|
|
87
|
+
finishRealtimeTranscribeJob(jobId, context, sliceNSamples.stream().mapToInt(i -> i).toArray());
|
|
117
88
|
}
|
|
118
89
|
|
|
119
90
|
public int startRealtimeTranscribe(int jobId, ReadableMap options) {
|
|
@@ -135,16 +106,15 @@ public class WhisperContext {
|
|
|
135
106
|
|
|
136
107
|
int realtimeAudioSec = options.hasKey("realtimeAudioSec") ? options.getInt("realtimeAudioSec") : 0;
|
|
137
108
|
final int audioSec = realtimeAudioSec > 0 ? realtimeAudioSec : DEFAULT_MAX_AUDIO_SEC;
|
|
138
|
-
|
|
139
109
|
int realtimeAudioSliceSec = options.hasKey("realtimeAudioSliceSec") ? options.getInt("realtimeAudioSliceSec") : 0;
|
|
140
110
|
final int audioSliceSec = realtimeAudioSliceSec > 0 && realtimeAudioSliceSec < audioSec ? realtimeAudioSliceSec : audioSec;
|
|
141
|
-
|
|
142
111
|
isUseSlices = audioSliceSec < audioSec;
|
|
143
112
|
|
|
144
|
-
|
|
113
|
+
double realtimeAudioMinSec = options.hasKey("realtimeAudioMinSec") ? options.getDouble("realtimeAudioMinSec") : 0;
|
|
114
|
+
final double audioMinSec = realtimeAudioMinSec > 0.5 && realtimeAudioMinSec <= audioSliceSec ? realtimeAudioMinSec : 1;
|
|
115
|
+
|
|
116
|
+
createRealtimeTranscribeJob(jobId, context, options);
|
|
145
117
|
|
|
146
|
-
shortBufferSlices = new ArrayList<short[]>();
|
|
147
|
-
shortBufferSlices.add(new short[audioSliceSec * SAMPLE_RATE]);
|
|
148
118
|
sliceNSamples = new ArrayList<Integer>();
|
|
149
119
|
sliceNSamples.add(0);
|
|
150
120
|
|
|
@@ -175,49 +145,43 @@ public class WhisperContext {
|
|
|
175
145
|
nSamples == nSamplesTranscribing &&
|
|
176
146
|
sliceIndex == transcribeSliceIndex
|
|
177
147
|
) {
|
|
178
|
-
finishRealtimeTranscribe(
|
|
148
|
+
finishRealtimeTranscribe(Arguments.createMap());
|
|
179
149
|
} else if (!isTranscribing) {
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
finishRealtimeTranscribe(options, Arguments.createMap());
|
|
150
|
+
boolean isSamplesEnough = nSamples / SAMPLE_RATE >= audioMinSec;
|
|
151
|
+
if (!isSamplesEnough || !vad(sliceIndex, nSamples, 0)) {
|
|
152
|
+
finishRealtimeTranscribe(Arguments.createMap());
|
|
184
153
|
break;
|
|
185
154
|
}
|
|
186
155
|
isTranscribing = true;
|
|
187
|
-
fullTranscribeSamples(
|
|
156
|
+
fullTranscribeSamples(true);
|
|
188
157
|
}
|
|
189
158
|
break;
|
|
190
159
|
}
|
|
191
160
|
|
|
192
161
|
// Append to buffer
|
|
193
|
-
short[] shortBuffer = shortBufferSlices.get(sliceIndex);
|
|
194
162
|
if (nSamples + n > audioSliceSec * SAMPLE_RATE) {
|
|
195
163
|
Log.d(NAME, "next slice");
|
|
196
164
|
|
|
197
165
|
sliceIndex++;
|
|
198
166
|
nSamples = 0;
|
|
199
|
-
shortBuffer = new short[audioSliceSec * SAMPLE_RATE];
|
|
200
|
-
shortBufferSlices.add(shortBuffer);
|
|
201
167
|
sliceNSamples.add(0);
|
|
202
168
|
}
|
|
169
|
+
putPcmData(jobId, buffer, sliceIndex, nSamples, n);
|
|
203
170
|
|
|
204
|
-
|
|
205
|
-
shortBuffer[nSamples + i] = buffer[i];
|
|
206
|
-
}
|
|
207
|
-
|
|
208
|
-
boolean isSpeech = vad(options, shortBuffer, nSamples, n);
|
|
171
|
+
boolean isSpeech = vad(sliceIndex, nSamples, n);
|
|
209
172
|
|
|
210
173
|
nSamples += n;
|
|
211
174
|
sliceNSamples.set(sliceIndex, nSamples);
|
|
212
175
|
|
|
213
|
-
|
|
176
|
+
boolean isSamplesEnough = nSamples / SAMPLE_RATE >= audioMinSec;
|
|
177
|
+
if (!isSamplesEnough || !isSpeech) continue;
|
|
214
178
|
|
|
215
179
|
if (!isTranscribing && nSamples > SAMPLE_RATE / 2) {
|
|
216
180
|
isTranscribing = true;
|
|
217
181
|
fullHandler = new Thread(new Runnable() {
|
|
218
182
|
@Override
|
|
219
183
|
public void run() {
|
|
220
|
-
fullTranscribeSamples(
|
|
184
|
+
fullTranscribeSamples(false);
|
|
221
185
|
}
|
|
222
186
|
});
|
|
223
187
|
fullHandler.start();
|
|
@@ -228,7 +192,7 @@ public class WhisperContext {
|
|
|
228
192
|
}
|
|
229
193
|
|
|
230
194
|
if (!isTranscribing) {
|
|
231
|
-
finishRealtimeTranscribe(
|
|
195
|
+
finishRealtimeTranscribe(Arguments.createMap());
|
|
232
196
|
}
|
|
233
197
|
if (fullHandler != null) {
|
|
234
198
|
fullHandler.join(); // Wait for full transcribe to finish
|
|
@@ -246,26 +210,16 @@ public class WhisperContext {
|
|
|
246
210
|
return state;
|
|
247
211
|
}
|
|
248
212
|
|
|
249
|
-
private void fullTranscribeSamples(
|
|
213
|
+
private void fullTranscribeSamples(boolean skipCapturingCheck) {
|
|
250
214
|
int nSamplesOfIndex = sliceNSamples.get(transcribeSliceIndex);
|
|
251
215
|
|
|
252
216
|
if (!isCapturing && !skipCapturingCheck) return;
|
|
253
217
|
|
|
254
|
-
short[] shortBuffer = shortBufferSlices.get(transcribeSliceIndex);
|
|
255
|
-
int nSamples = sliceNSamples.get(transcribeSliceIndex);
|
|
256
|
-
|
|
257
218
|
nSamplesTranscribing = nSamplesOfIndex;
|
|
258
|
-
|
|
259
|
-
// convert I16 to F32
|
|
260
|
-
float[] nSamplesBuffer32 = new float[nSamplesTranscribing];
|
|
261
|
-
for (int i = 0; i < nSamplesTranscribing; i++) {
|
|
262
|
-
nSamplesBuffer32[i] = shortBuffer[i] / 32768.0f;
|
|
263
|
-
}
|
|
264
|
-
|
|
265
219
|
Log.d(NAME, "Start transcribing realtime: " + nSamplesTranscribing);
|
|
266
220
|
|
|
267
221
|
int timeStart = (int) System.currentTimeMillis();
|
|
268
|
-
int code =
|
|
222
|
+
int code = fullWithJob(jobId, context, transcribeSliceIndex, nSamplesTranscribing);
|
|
269
223
|
int timeEnd = (int) System.currentTimeMillis();
|
|
270
224
|
int timeRecording = (int) (nSamplesTranscribing / SAMPLE_RATE * 1000);
|
|
271
225
|
|
|
@@ -302,7 +256,7 @@ public class WhisperContext {
|
|
|
302
256
|
if (isStopped && !continueNeeded) {
|
|
303
257
|
payload.putBoolean("isCapturing", false);
|
|
304
258
|
payload.putBoolean("isStoppedByAction", isStoppedByAction);
|
|
305
|
-
finishRealtimeTranscribe(
|
|
259
|
+
finishRealtimeTranscribe(payload);
|
|
306
260
|
} else if (code == 0) {
|
|
307
261
|
payload.putBoolean("isCapturing", true);
|
|
308
262
|
emitTranscribeEvent("@RNWhisper_onRealtimeTranscribe", payload);
|
|
@@ -313,7 +267,7 @@ public class WhisperContext {
|
|
|
313
267
|
|
|
314
268
|
if (continueNeeded) {
|
|
315
269
|
// If no more capturing, continue transcribing until all slices are transcribed
|
|
316
|
-
fullTranscribeSamples(
|
|
270
|
+
fullTranscribeSamples(true);
|
|
317
271
|
} else if (isStopped) {
|
|
318
272
|
// No next, cleanup
|
|
319
273
|
rewind();
|
|
@@ -383,62 +337,30 @@ public class WhisperContext {
|
|
|
383
337
|
this.jobId = jobId;
|
|
384
338
|
isTranscribing = true;
|
|
385
339
|
float[] audioData = AudioUtils.decodeWaveFile(inputStream);
|
|
386
|
-
int code = full(jobId, options, audioData, audioData.length);
|
|
387
|
-
isTranscribing = false;
|
|
388
|
-
this.jobId = -1;
|
|
389
|
-
if (code != 0 && code != 999) {
|
|
390
|
-
throw new Exception("Failed to transcribe the file. Code: " + code);
|
|
391
|
-
}
|
|
392
|
-
WritableMap result = getTextSegments(0, getTextSegmentCount(context));
|
|
393
|
-
result.putBoolean("isAborted", isStoppedByAction);
|
|
394
|
-
return result;
|
|
395
|
-
}
|
|
396
340
|
|
|
397
|
-
private int full(int jobId, ReadableMap options, float[] audioData, int audioDataLen) {
|
|
398
341
|
boolean hasProgressCallback = options.hasKey("onProgress") && options.getBoolean("onProgress");
|
|
399
342
|
boolean hasNewSegmentsCallback = options.hasKey("onNewSegments") && options.getBoolean("onNewSegments");
|
|
400
|
-
|
|
343
|
+
int code = fullWithNewJob(
|
|
401
344
|
jobId,
|
|
402
345
|
context,
|
|
403
346
|
// float[] audio_data,
|
|
404
347
|
audioData,
|
|
405
348
|
// jint audio_data_len,
|
|
406
|
-
|
|
407
|
-
//
|
|
408
|
-
options
|
|
409
|
-
// jint max_context,
|
|
410
|
-
options.hasKey("maxContext") ? options.getInt("maxContext") : -1,
|
|
411
|
-
|
|
412
|
-
// jint word_thold,
|
|
413
|
-
options.hasKey("wordThold") ? options.getInt("wordThold") : -1,
|
|
414
|
-
// jint max_len,
|
|
415
|
-
options.hasKey("maxLen") ? options.getInt("maxLen") : -1,
|
|
416
|
-
// jboolean token_timestamps,
|
|
417
|
-
options.hasKey("tokenTimestamps") ? options.getBoolean("tokenTimestamps") : false,
|
|
418
|
-
|
|
419
|
-
// jint offset,
|
|
420
|
-
options.hasKey("offset") ? options.getInt("offset") : -1,
|
|
421
|
-
// jint duration,
|
|
422
|
-
options.hasKey("duration") ? options.getInt("duration") : -1,
|
|
423
|
-
// jfloat temperature,
|
|
424
|
-
options.hasKey("temperature") ? (float) options.getDouble("temperature") : -1.0f,
|
|
425
|
-
// jfloat temperature_inc,
|
|
426
|
-
options.hasKey("temperatureInc") ? (float) options.getDouble("temperatureInc") : -1.0f,
|
|
427
|
-
// jint beam_size,
|
|
428
|
-
options.hasKey("beamSize") ? options.getInt("beamSize") : -1,
|
|
429
|
-
// jint best_of,
|
|
430
|
-
options.hasKey("bestOf") ? options.getInt("bestOf") : -1,
|
|
431
|
-
// jboolean speed_up,
|
|
432
|
-
options.hasKey("speedUp") ? options.getBoolean("speedUp") : false,
|
|
433
|
-
// jboolean translate,
|
|
434
|
-
options.hasKey("translate") ? options.getBoolean("translate") : false,
|
|
435
|
-
// jstring language,
|
|
436
|
-
options.hasKey("language") ? options.getString("language") : "auto",
|
|
437
|
-
// jstring prompt
|
|
438
|
-
options.hasKey("prompt") ? options.getString("prompt") : null,
|
|
349
|
+
audioData.length,
|
|
350
|
+
// ReadableMap options,
|
|
351
|
+
options,
|
|
439
352
|
// Callback callback
|
|
440
353
|
hasProgressCallback || hasNewSegmentsCallback ? new Callback(this, hasProgressCallback, hasNewSegmentsCallback) : null
|
|
441
354
|
);
|
|
355
|
+
|
|
356
|
+
isTranscribing = false;
|
|
357
|
+
this.jobId = -1;
|
|
358
|
+
if (code != 0 && code != 999) {
|
|
359
|
+
throw new Exception("Failed to transcribe the file. Code: " + code);
|
|
360
|
+
}
|
|
361
|
+
WritableMap result = getTextSegments(0, getTextSegmentCount(context));
|
|
362
|
+
result.putBoolean("isAborted", isStoppedByAction);
|
|
363
|
+
return result;
|
|
442
364
|
}
|
|
443
365
|
|
|
444
366
|
private WritableMap getTextSegments(int start, int count) {
|
|
@@ -557,31 +479,18 @@ public class WhisperContext {
|
|
|
557
479
|
}
|
|
558
480
|
}
|
|
559
481
|
|
|
560
|
-
|
|
482
|
+
// JNI methods
|
|
561
483
|
protected static native long initContext(String modelPath);
|
|
562
484
|
protected static native long initContextWithAsset(AssetManager assetManager, String modelPath);
|
|
563
485
|
protected static native long initContextWithInputStream(PushbackInputStream inputStream);
|
|
564
|
-
protected static native
|
|
565
|
-
|
|
486
|
+
protected static native void freeContext(long contextPtr);
|
|
487
|
+
|
|
488
|
+
protected static native int fullWithNewJob(
|
|
566
489
|
int job_id,
|
|
567
490
|
long context,
|
|
568
491
|
float[] audio_data,
|
|
569
492
|
int audio_data_len,
|
|
570
|
-
|
|
571
|
-
int max_context,
|
|
572
|
-
int word_thold,
|
|
573
|
-
int max_len,
|
|
574
|
-
boolean token_timestamps,
|
|
575
|
-
int offset,
|
|
576
|
-
int duration,
|
|
577
|
-
float temperature,
|
|
578
|
-
float temperature_inc,
|
|
579
|
-
int beam_size,
|
|
580
|
-
int best_of,
|
|
581
|
-
boolean speed_up,
|
|
582
|
-
boolean translate,
|
|
583
|
-
String language,
|
|
584
|
-
String prompt,
|
|
493
|
+
ReadableMap options,
|
|
585
494
|
Callback Callback
|
|
586
495
|
);
|
|
587
496
|
protected static native void abortTranscribe(int jobId);
|
|
@@ -590,5 +499,19 @@ public class WhisperContext {
|
|
|
590
499
|
protected static native String getTextSegment(long context, int index);
|
|
591
500
|
protected static native int getTextSegmentT0(long context, int index);
|
|
592
501
|
protected static native int getTextSegmentT1(long context, int index);
|
|
593
|
-
|
|
502
|
+
|
|
503
|
+
protected static native void createRealtimeTranscribeJob(
|
|
504
|
+
int job_id,
|
|
505
|
+
long context,
|
|
506
|
+
ReadableMap options
|
|
507
|
+
);
|
|
508
|
+
protected static native void finishRealtimeTranscribeJob(int job_id, long context, int[] sliceNSamples);
|
|
509
|
+
protected static native boolean vadSimple(int job_id, int slice_index, int n_samples, int n);
|
|
510
|
+
protected static native void putPcmData(int job_id, short[] buffer, int slice_index, int n_samples, int n);
|
|
511
|
+
protected static native int fullWithJob(
|
|
512
|
+
int job_id,
|
|
513
|
+
long context,
|
|
514
|
+
int slice_index,
|
|
515
|
+
int n_samples
|
|
516
|
+
);
|
|
594
517
|
}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
#include <jni.h>
|
|
2
|
+
|
|
3
|
+
// ReadableMap utils
|
|
4
|
+
|
|
5
|
+
namespace readablemap {
|
|
6
|
+
|
|
7
|
+
bool hasKey(JNIEnv *env, jobject readableMap, const char *key) {
|
|
8
|
+
jclass mapClass = env->GetObjectClass(readableMap);
|
|
9
|
+
jmethodID hasKeyMethod = env->GetMethodID(mapClass, "hasKey", "(Ljava/lang/String;)Z");
|
|
10
|
+
jstring jKey = env->NewStringUTF(key);
|
|
11
|
+
jboolean result = env->CallBooleanMethod(readableMap, hasKeyMethod, jKey);
|
|
12
|
+
env->DeleteLocalRef(jKey);
|
|
13
|
+
return result;
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
int getInt(JNIEnv *env, jobject readableMap, const char *key, jint defaultValue) {
|
|
17
|
+
if (!hasKey(env, readableMap, key)) {
|
|
18
|
+
return defaultValue;
|
|
19
|
+
}
|
|
20
|
+
jclass mapClass = env->GetObjectClass(readableMap);
|
|
21
|
+
jmethodID getIntMethod = env->GetMethodID(mapClass, "getInt", "(Ljava/lang/String;)I");
|
|
22
|
+
jstring jKey = env->NewStringUTF(key);
|
|
23
|
+
jint result = env->CallIntMethod(readableMap, getIntMethod, jKey);
|
|
24
|
+
env->DeleteLocalRef(jKey);
|
|
25
|
+
return result;
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
bool getBool(JNIEnv *env, jobject readableMap, const char *key, jboolean defaultValue) {
|
|
29
|
+
if (!hasKey(env, readableMap, key)) {
|
|
30
|
+
return defaultValue;
|
|
31
|
+
}
|
|
32
|
+
jclass mapClass = env->GetObjectClass(readableMap);
|
|
33
|
+
jmethodID getBoolMethod = env->GetMethodID(mapClass, "getBoolean", "(Ljava/lang/String;)Z");
|
|
34
|
+
jstring jKey = env->NewStringUTF(key);
|
|
35
|
+
jboolean result = env->CallBooleanMethod(readableMap, getBoolMethod, jKey);
|
|
36
|
+
env->DeleteLocalRef(jKey);
|
|
37
|
+
return result;
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
long getLong(JNIEnv *env, jobject readableMap, const char *key, jlong defaultValue) {
|
|
41
|
+
if (!hasKey(env, readableMap, key)) {
|
|
42
|
+
return defaultValue;
|
|
43
|
+
}
|
|
44
|
+
jclass mapClass = env->GetObjectClass(readableMap);
|
|
45
|
+
jmethodID getLongMethod = env->GetMethodID(mapClass, "getLong", "(Ljava/lang/String;)J");
|
|
46
|
+
jstring jKey = env->NewStringUTF(key);
|
|
47
|
+
jlong result = env->CallLongMethod(readableMap, getLongMethod, jKey);
|
|
48
|
+
env->DeleteLocalRef(jKey);
|
|
49
|
+
return result;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
float getFloat(JNIEnv *env, jobject readableMap, const char *key, jfloat defaultValue) {
|
|
53
|
+
if (!hasKey(env, readableMap, key)) {
|
|
54
|
+
return defaultValue;
|
|
55
|
+
}
|
|
56
|
+
jclass mapClass = env->GetObjectClass(readableMap);
|
|
57
|
+
jmethodID getFloatMethod = env->GetMethodID(mapClass, "getDouble", "(Ljava/lang/String;)D");
|
|
58
|
+
jstring jKey = env->NewStringUTF(key);
|
|
59
|
+
jfloat result = env->CallDoubleMethod(readableMap, getFloatMethod, jKey);
|
|
60
|
+
env->DeleteLocalRef(jKey);
|
|
61
|
+
return result;
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
jstring getString(JNIEnv *env, jobject readableMap, const char *key, jstring defaultValue) {
|
|
65
|
+
if (!hasKey(env, readableMap, key)) {
|
|
66
|
+
return defaultValue;
|
|
67
|
+
}
|
|
68
|
+
jclass mapClass = env->GetObjectClass(readableMap);
|
|
69
|
+
jmethodID getStringMethod = env->GetMethodID(mapClass, "getString", "(Ljava/lang/String;)Ljava/lang/String;");
|
|
70
|
+
jstring jKey = env->NewStringUTF(key);
|
|
71
|
+
jstring result = (jstring) env->CallObjectMethod(readableMap, getStringMethod, jKey);
|
|
72
|
+
env->DeleteLocalRef(jKey);
|
|
73
|
+
return result;
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
}
|