whisper.rn 0.1.5 → 0.2.0

package/README.md CHANGED
@@ -20,6 +20,23 @@ npm install whisper.rn
 
  Then re-run `npx pod-install` again for iOS.
 
+ ## Add Microphone Permissions (Optional)
+
+ If you want to use realtime transcription, you need to add the microphone permission to your app.
+
+ ### iOS
+ Add these lines to `ios/[YOUR_APP_NAME]/Info.plist`:
+ ```xml
+ <key>NSMicrophoneUsageDescription</key>
+ <string>This app requires microphone access in order to transcribe speech</string>
+ ```
+
+ ### Android
+ Add the following line to `android/app/src/main/AndroidManifest.xml`:
+ ```xml
+ <uses-permission android:name="android.permission.RECORD_AUDIO" />
+ ```
+
  ## Usage
 
  ```js
@@ -30,13 +47,35 @@ const sampleFilePath = 'file://.../sample.wav'
 
  const whisperContext = await initWhisper({ filePath })
 
- const { result } = await whisperContext.transcribe(sampleFilePath, {
-   language: 'en',
-   // More options
- })
+ const options = { language: 'en' }
+ const { stop, promise } = whisperContext.transcribe(sampleFilePath, options)
+
+ const { result } = await promise
  // result: (The inference text result from audio file)
  ```
 
+ Use realtime transcription:
+
+ ```js
+ const { stop, subscribe } = whisperContext.transcribeRealtime(options)
+
+ subscribe(evt => {
+   const { isCapturing, data, processTime, recordingTime } = evt
+   console.log(
+     `Realtime transcribing: ${isCapturing ? 'ON' : 'OFF'}\n` +
+       // The inference text result from the audio recording:
+       `Result: ${data.result}\n\n` +
+       `Process time: ${processTime}ms\n` +
+       `Recording time: ${recordingTime}ms`,
+   )
+   if (!isCapturing) console.log('Finished realtime transcribing')
+ })
+ ```
+
+ On Android, you may need to request the microphone permission with [`PermissionsAndroid`](https://reactnative.dev/docs/permissionsandroid) (see the sketch below).
+
+ The documentation is not ready yet; please see the comments in the [index](./src/index.tsx) file for more details in the meantime.
+
  ## Run with example
 
  The example app is using [react-native-fs](https://github.com/itinance/react-native-fs) to download the model file and audio file.
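
Downloading a model with react-native-fs might look roughly like the following (a sketch; the model URL and destination path are placeholders, not the example app's actual values):

```js
import RNFS from 'react-native-fs'

// Placeholder URL and path for illustration only
const modelUrl = 'https://example.com/ggml-base.en.bin'
const modelPath = `${RNFS.DocumentDirectoryPath}/ggml-base.en.bin`

await RNFS.downloadFile({ fromUrl: modelUrl, toFile: modelPath }).promise
const whisperContext = await initWhisper({ filePath: modelPath })
```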
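
Requesting the Android microphone permission at runtime might look like this (a minimal sketch; `ensureMicPermission` is an illustrative helper, not a package API):

```js
import { PermissionsAndroid, Platform } from 'react-native'

// Ask for RECORD_AUDIO before starting realtime transcription (Android only)
async function ensureMicPermission() {
  if (Platform.OS !== 'android') return true
  const status = await PermissionsAndroid.request(
    PermissionsAndroid.PERMISSIONS.RECORD_AUDIO,
  )
  return status === PermissionsAndroid.RESULTS.GRANTED
}
```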
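
The `{ stop, promise }` pair returned by the new `transcribe()` also lets you abort an in-flight job. A minimal sketch of that usage (the timeout and its duration are illustrative assumptions, and the exact settle behavior of `promise` after `stop()` is up to the library):

```js
const { stop, promise } = whisperContext.transcribe(sampleFilePath, options)

// Abort the job if it runs longer than 30 seconds (illustrative threshold)
const timer = setTimeout(() => stop(), 30000)

const { result } = await promise
clearTimeout(timer)
```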
package/android/build.gradle CHANGED
@@ -40,10 +40,8 @@ android {
      buildConfigField "boolean", "IS_NEW_ARCHITECTURE_ENABLED", isNewArchitectureEnabled().toString()
    }
    externalNativeBuild {
-     externalNativeBuild {
-       ndkBuild {
-         path 'src/main/jni/whisper/Android.mk'
-       }
+     ndkBuild {
+       path 'src/main/jni/whisper/Android.mk'
      }
    }
    buildTypes {
package/android/src/main/java/com/rnwhisper/RNWhisperModule.java CHANGED
@@ -5,6 +5,7 @@ import android.util.Log;
  import android.os.Build;
  import android.os.Handler;
  import android.os.AsyncTask;
+ import android.media.AudioRecord;
 
  import com.facebook.react.bridge.Promise;
  import com.facebook.react.bridge.ReactApplicationContext;
@@ -51,7 +52,7 @@ public class RNWhisperModule extends ReactContextBaseJavaModule implements Lifec
        throw new Exception("Failed to initialize context");
      }
      int id = Math.abs(new Random().nextInt());
-     WhisperContext whisperContext = new WhisperContext(context);
+     WhisperContext whisperContext = new WhisperContext(id, reactContext, context);
      contexts.put(id, whisperContext);
      return id;
    } catch (Exception e) {
@@ -72,18 +73,27 @@ public class RNWhisperModule extends ReactContextBaseJavaModule implements Lifec
    }
 
    @ReactMethod
-   public void transcribe(int id, String filePath, ReadableMap options, Promise promise) {
+   public void transcribeFile(int id, int jobId, String filePath, ReadableMap options, Promise promise) {
+     final WhisperContext context = contexts.get(id);
+     if (context == null) {
+       promise.reject("Context not found");
+       return;
+     }
+     if (context.isCapturing()) {
+       promise.reject("The context is in realtime transcribe mode");
+       return;
+     }
+     if (context.isTranscribing()) {
+       promise.reject("Context is already transcribing");
+       return;
+     }
      new AsyncTask<Void, Void, WritableMap>() {
        private Exception exception;
 
        @Override
        protected WritableMap doInBackground(Void... voids) {
          try {
-           WhisperContext context = contexts.get(id);
-           if (context == null) {
-             throw new Exception("Context " + id + " not found");
-           }
-           return context.transcribe(filePath, options);
+           return context.transcribeFile(jobId, filePath, options);
          } catch (Exception e) {
            exception = e;
            return null;
@@ -101,6 +111,36 @@ public class RNWhisperModule extends ReactContextBaseJavaModule implements Lifec
      }.execute();
    }
 
+   @ReactMethod
+   public void startRealtimeTranscribe(int id, int jobId, ReadableMap options, Promise promise) {
+     final WhisperContext context = contexts.get(id);
+     if (context == null) {
+       promise.reject("Context not found");
+       return;
+     }
+     if (context.isCapturing()) {
+       promise.reject("Context is already capturing");
+       return;
+     }
+     int state = context.startRealtimeTranscribe(jobId, options);
+     if (state == AudioRecord.STATE_INITIALIZED) {
+       promise.resolve(null);
+       return;
+     }
+     promise.reject("Failed to start realtime transcribe. State: " + state);
+   }
+
+   @ReactMethod
+   public void abortTranscribe(int contextId, int jobId, Promise promise) {
+     WhisperContext context = contexts.get(contextId);
+     if (context == null) {
+       promise.reject("Context not found");
+       return;
+     }
+     context.stopTranscribe(jobId);
+     // Resolve so the JS caller is not left with a pending promise
+     promise.resolve(null);
+   }
+
    @ReactMethod
    public void releaseContext(int id, Promise promise) {
      new AsyncTask<Void, Void, Void>() {
@@ -168,6 +207,7 @@ public class RNWhisperModule extends ReactContextBaseJavaModule implements Lifec
 
    @Override
    public void onHostDestroy() {
+     WhisperContext.abortAllTranscribe();
      for (WhisperContext context : contexts.values()) {
        context.release();
      }
package/android/src/main/java/com/rnwhisper/WhisperContext.java CHANGED
@@ -4,10 +4,15 @@ import com.facebook.react.bridge.Arguments;
  import com.facebook.react.bridge.WritableArray;
  import com.facebook.react.bridge.WritableMap;
  import com.facebook.react.bridge.ReadableMap;
+ import com.facebook.react.bridge.ReactApplicationContext;
+ import com.facebook.react.modules.core.DeviceEventManagerModule;
 
  import android.util.Log;
  import android.os.Build;
  import android.content.res.AssetManager;
+ import android.media.AudioFormat;
+ import android.media.AudioRecord;
+ import android.media.MediaRecorder.AudioSource;
 
  import java.util.Random;
  import java.lang.StringBuilder;
@@ -26,16 +31,175 @@ import java.nio.ShortBuffer;
 
  public class WhisperContext {
    public static final String NAME = "RNWhisperContext";
+
+   private static final int SAMPLE_RATE = 16000;
+   private static final int CHANNEL_CONFIG = AudioFormat.CHANNEL_IN_MONO;
+   private static final int AUDIO_FORMAT = AudioFormat.ENCODING_PCM_16BIT;
+   private static final int AUDIO_SOURCE = AudioSource.VOICE_RECOGNITION;
+   private static final int DEFAULT_MAX_AUDIO_SEC = 30;
+
+   private int id;
+   private ReactApplicationContext reactContext;
    private long context;
 
-   public WhisperContext(long context) {
+   private DeviceEventManagerModule.RCTDeviceEventEmitter eventEmitter;
+
+   private int jobId = -1;
+   private AudioRecord recorder = null;
+   private int bufferSize;
+   private short[] buffer16;
+   private int nSamples = 0;
+   private boolean isCapturing = false;
+   private boolean isTranscribing = false;
+   private boolean isRealtime = false;
+
+   public WhisperContext(int id, ReactApplicationContext reactContext, long context) {
+     this.id = id;
      this.context = context;
+     this.reactContext = reactContext;
+     eventEmitter = reactContext.getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter.class);
+     bufferSize = AudioRecord.getMinBufferSize(SAMPLE_RATE, CHANNEL_CONFIG, AUDIO_FORMAT);
+   }
+
+   public int startRealtimeTranscribe(int jobId, ReadableMap options) {
+     if (isCapturing || isTranscribing) {
+       return -100;
+     }
+
+     recorder = new AudioRecord(AUDIO_SOURCE, SAMPLE_RATE, CHANNEL_CONFIG, AUDIO_FORMAT, bufferSize);
+
+     int state = recorder.getState();
+     if (state != AudioRecord.STATE_INITIALIZED) {
+       recorder.release();
+       return state;
+     }
+
+     int realtimeAudioSec = options.hasKey("realtimeAudioSec") ? options.getInt("realtimeAudioSec") : 0;
+     final int maxAudioSec = realtimeAudioSec > 0 ? realtimeAudioSec : DEFAULT_MAX_AUDIO_SEC;
+
+     // Sized in samples (shorts), not bytes
+     buffer16 = new short[maxAudioSec * SAMPLE_RATE];
+
+     this.jobId = jobId;
+     isCapturing = true;
+     isRealtime = true;
+     nSamples = 0;
+
+     recorder.startRecording();
+
+     new Thread(new Runnable() {
+       @Override
+       public void run() {
+         try {
+           short[] buffer = new short[bufferSize];
+           Thread fullHandler = null;
+           while (isCapturing) {
+             try {
+               int n = recorder.read(buffer, 0, bufferSize);
+               if (n <= 0) continue;
+
+               if (nSamples + n > maxAudioSec * SAMPLE_RATE) {
+                 // Buffer is full: stop capturing and drop the extra data
+                 isCapturing = false;
+                 if (!isTranscribing)
+                   emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", Arguments.createMap());
+                 break;
+               }
+               // Append the new samples before advancing the sample count
+               for (int i = 0; i < n; i++) {
+                 buffer16[nSamples + i] = buffer[i];
+               }
+               nSamples += n;
+               if (!isTranscribing && nSamples > SAMPLE_RATE / 2) {
+                 isTranscribing = true;
+                 Log.d(NAME, "Start transcribing realtime: " + nSamples);
+                 fullHandler = new Thread(new Runnable() {
+                   @Override
+                   public void run() {
+                     if (!isCapturing) return;
+
+                     // Convert I16 to F32
+                     float[] nSamplesBuffer32 = new float[nSamples];
+                     for (int i = 0; i < nSamples; i++) {
+                       nSamplesBuffer32[i] = buffer16[i] / 32768.0f;
+                     }
+
+                     int timeStart = (int) System.currentTimeMillis();
+                     int code = full(jobId, options, nSamplesBuffer32, nSamples);
+                     int timeEnd = (int) System.currentTimeMillis();
+                     // Milliseconds; avoid truncating to whole seconds
+                     int timeRecording = (int) ((long) nSamples * 1000 / SAMPLE_RATE);
+
+                     WritableMap payload = Arguments.createMap();
+                     payload.putBoolean("isCapturing", isCapturing);
+                     payload.putInt("code", code);
+                     payload.putInt("processTime", timeEnd - timeStart);
+                     payload.putInt("recordingTime", timeRecording);
+
+                     if (code == 0) {
+                       payload.putMap("data", getTextSegments());
+                       emitTranscribeEvent("@RNWhisper_onRealtimeTranscribe", payload);
+                     } else {
+                       payload.putString("error", "Transcribe failed with code " + code);
+                       emitTranscribeEvent("@RNWhisper_onRealtimeTranscribe", payload);
+                     }
+
+                     if (!isCapturing) {
+                       emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", Arguments.createMap());
+                     }
+                     isTranscribing = false;
+                   }
+                 });
+                 fullHandler.start();
+               }
+             } catch (Exception e) {
+               Log.e(NAME, "Error transcribing realtime: " + e.getMessage());
+             }
+           }
+           if (fullHandler != null) {
+             fullHandler.join(); // Wait for the in-flight transcription to finish
+           }
+           recorder.stop();
+         } catch (Exception e) {
+           e.printStackTrace();
+         } finally {
+           recorder.release();
+           recorder = null;
+         }
+       }
+     }).start();
+
+     return state;
+   }
+
+   private void emitTranscribeEvent(final String eventName, final WritableMap payload) {
+     WritableMap event = Arguments.createMap();
+     event.putInt("contextId", WhisperContext.this.id);
+     event.putInt("jobId", jobId);
+     event.putMap("payload", payload);
+     eventEmitter.emit(eventName, event);
    }
 
-   public WritableMap transcribe(final String filePath, final ReadableMap options) throws IOException, Exception {
-     int code = fullTranscribe(
+   public WritableMap transcribeFile(int jobId, String filePath, ReadableMap options) throws IOException, Exception {
+     this.jobId = jobId;
+     isTranscribing = true;
+     float[] audioData = decodeWaveFile(new File(filePath));
+     int code = full(jobId, options, audioData, audioData.length);
+     isTranscribing = false;
+     this.jobId = -1;
+     if (code != 0) {
+       throw new Exception("Failed to transcribe the file. Code: " + code);
+     }
+     return getTextSegments();
+   }
+
+   private int full(int jobId, ReadableMap options, float[] audioData, int audioDataLen) {
+     return fullTranscribe(
+       jobId,
        context,
-       decodeWaveFile(new File(filePath)),
+       // jboolean realtime,
+       isRealtime,
+       // float[] audio_data,
+       audioData,
+       // jint audio_data_len,
+       audioDataLen,
        // jint n_threads,
        options.hasKey("maxThreads") ? options.getInt("maxThreads") : -1,
        // jint max_context,
@@ -69,9 +233,9 @@ public class WhisperContext {
      // jstring prompt
      options.hasKey("prompt") ? options.getString("prompt") : null
    );
-   if (code != 0) {
-     throw new Exception("Transcription failed with code " + code);
-   }
+ }
+
+ private WritableMap getTextSegments() {
    Integer count = getTextSegmentCount(context);
    StringBuilder builder = new StringBuilder();
 
@@ -92,7 +256,27 @@ public class WhisperContext {
    return data;
  }
 
+
+ public boolean isCapturing() {
+   return isCapturing;
+ }
+
+ public boolean isTranscribing() {
+   return isTranscribing;
+ }
+
+ public void stopTranscribe(int jobId) {
+   abortTranscribe(jobId);
+   isCapturing = false;
+   isTranscribing = false;
+ }
+
+ public void stopCurrentTranscribe() {
+   stopTranscribe(this.jobId);
+ }
+
  public void release() {
+   stopCurrentTranscribe();
    freeContext(context);
  }
 
@@ -185,8 +369,11 @@ public class WhisperContext {
 
    protected static native long initContext(String modelPath);
    protected static native int fullTranscribe(
+     int job_id,
      long context,
+     boolean realtime,
      float[] audio_data,
+     int audio_data_len,
      int n_threads,
      int max_context,
      int word_thold,
@@ -203,6 +390,8 @@ public class WhisperContext {
      String language,
      String prompt
    );
+   protected static native void abortTranscribe(int jobId);
+   protected static native void abortAllTranscribe();
    protected static native int getTextSegmentCount(long context);
    protected static native String getTextSegment(long context, int index);
    protected static native int getTextSegmentT0(long context, int index);
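
On the JS side, the events emitted by `emitTranscribeEvent` above would be consumed through React Native's event emitter. A minimal sketch (the event names and payload shape are taken from the diff; the actual wiring lives in the package's index.tsx and may differ):

```js
import { NativeEventEmitter, NativeModules } from 'react-native'

const emitter = new NativeEventEmitter(NativeModules.RNWhisper)

// Payload shape as built in WhisperContext.emitTranscribeEvent
const subscription = emitter.addListener('@RNWhisper_onRealtimeTranscribe', (event) => {
  const { contextId, jobId, payload } = event
  console.log(contextId, jobId, payload.isCapturing, payload.data && payload.data.result)
})

// Later, to stop listening:
// subscription.remove()
```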
package/android/src/main/jni/whisper/Android.mk CHANGED
@@ -12,7 +12,7 @@ ifneq ($(APP_OPTIM),debug)
  endif
 
  LOCAL_CFLAGS += -DSTDC_HEADERS -std=c11 -I $(WHISPER_LIB_DIR)
- LOCAL_CPPFLAGS += -std=c++11
+ LOCAL_CPPFLAGS += -std=c++11 -I $(WHISPER_LIB_DIR)
  LOCAL_SRC_FILES := $(WHISPER_LIB_DIR)/ggml.c \
    $(WHISPER_LIB_DIR)/whisper.cpp \
    $(WHISPER_LIB_DIR)/rn-whisper.cpp \
@@ -36,8 +36,11 @@ JNIEXPORT jint JNICALL
  Java_com_rnwhisper_WhisperContext_fullTranscribe(
    JNIEnv *env,
    jobject thiz,
+   jint job_id,
    jlong context_ptr,
+   jboolean realtime,
    jfloatArray audio_data,
+   jint audio_data_len,
    jint n_threads,
    jint max_context,
    int word_thold,
@@ -57,7 +60,6 @@ Java_com_rnwhisper_WhisperContext_fullTranscribe(
    UNUSED(thiz);
    struct whisper_context *context = reinterpret_cast<struct whisper_context *>(context_ptr);
    jfloat *audio_data_arr = env->GetFloatArrayElements(audio_data, nullptr);
-   const jsize audio_data_length = env->GetArrayLength(audio_data);
 
    int max_threads = min(4, get_nprocs());
 
@@ -81,7 +83,7 @@ Java_com_rnwhisper_WhisperContext_fullTranscribe(
    params.speed_up = speed_up;
    params.offset_ms = 0;
    params.no_context = true;
-   params.single_segment = false;
+   params.single_segment = realtime;
 
    if (max_len > -1) {
      params.max_len = max_len;
@@ -117,19 +119,45 @@ Java_com_rnwhisper_WhisperContext_fullTranscribe(
      );
    }
 
+   params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
+     bool is_aborted = *(bool*)user_data;
+     return !is_aborted;
+   };
+   params.encoder_begin_callback_user_data = rn_whisper_assign_abort_map(job_id);
+
    LOGI("About to reset timings");
    whisper_reset_timings(context);
 
    LOGI("About to run whisper_full");
-   int code = whisper_full(context, params, audio_data_arr, audio_data_length);
+   int code = whisper_full(context, params, audio_data_arr, audio_data_len);
    if (code == 0) {
      // whisper_print_timings(context);
    }
    env->ReleaseFloatArrayElements(audio_data, audio_data_arr, JNI_ABORT);
    env->ReleaseStringUTFChars(language, language_chars);
+   rn_whisper_remove_abort_map(job_id);
    return code;
  }
 
+ JNIEXPORT void JNICALL
+ Java_com_rnwhisper_WhisperContext_abortTranscribe(
+   JNIEnv *env,
+   jobject thiz,
+   jint job_id
+ ) {
+   UNUSED(thiz);
+   rn_whisper_abort_transcribe(job_id);
+ }
+
+ JNIEXPORT void JNICALL
+ Java_com_rnwhisper_WhisperContext_abortAllTranscribe(
+   JNIEnv *env,
+   jobject thiz
+ ) {
+   UNUSED(thiz);
+   rn_whisper_abort_all_transcribe();
+ }
+
  JNIEXPORT jint JNICALL
  Java_com_rnwhisper_WhisperContext_getTextSegmentCount(
    JNIEnv *env, jobject thiz, jlong context_ptr) {
@@ -176,4 +204,4 @@ Java_com_rnwhisper_WhisperContext_freeContext(
    whisper_free(context);
  }
 
- } // extern "C"
+ } // extern "C"
package/cpp/rn-whisper.cpp CHANGED
@@ -1,6 +1,7 @@
  #include <cstdio>
  #include <string>
  #include <vector>
+ #include <unordered_map>
  #include "whisper.h"
 
  extern "C" {
@@ -28,4 +29,29 @@ void rn_whisper_convert_prompt(
    }
  }
 
+ std::unordered_map<int, bool> abort_map;
+
+ bool* rn_whisper_assign_abort_map(int job_id) {
+   abort_map[job_id] = false;
+   return &abort_map[job_id];
+ }
+
+ void rn_whisper_remove_abort_map(int job_id) {
+   if (abort_map.find(job_id) != abort_map.end()) {
+     abort_map.erase(job_id);
+   }
+ }
+
+ void rn_whisper_abort_transcribe(int job_id) {
+   if (abort_map.find(job_id) != abort_map.end()) {
+     abort_map[job_id] = true;
+   }
+ }
+
+ void rn_whisper_abort_all_transcribe() {
+   for (auto it = abort_map.begin(); it != abort_map.end(); ++it) {
+     it->second = true;
+   }
+ }
+
  }
package/cpp/rn-whisper.h CHANGED
@@ -11,6 +11,11 @@ void rn_whisper_convert_prompt(
    std::string * prompt
  );
 
+ bool* rn_whisper_assign_abort_map(int job_id);
+ void rn_whisper_remove_abort_map(int job_id);
+ void rn_whisper_abort_transcribe(int job_id);
+ void rn_whisper_abort_all_transcribe();
+
  #ifdef __cplusplus
  }
  #endif
package/ios/RNWhisper.h CHANGED
@@ -3,9 +3,9 @@
  #import "rn-whisper.h"
  #endif
 
-
  #import <React/RCTBridgeModule.h>
+ #import <React/RCTEventEmitter.h>
 
- @interface RNWhisper : NSObject <RCTBridgeModule>
+ @interface RNWhisper : RCTEventEmitter <RCTBridgeModule>
 
  @end