whisper.rn 0.3.0-rc.4 → 0.3.0-rc.6

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
package/src/NativeRNWhisper.ts ADDED
@@ -0,0 +1,79 @@
+ import type { TurboModule } from 'react-native/Libraries/TurboModule/RCTExport'
+ import { TurboModuleRegistry } from 'react-native'
+
+ export type TranscribeOptions = {
+ /** Spoken language (Default: 'auto' for auto-detect) */
+ language?: string,
+ /** Translate from source language to english (Default: false) */
+ translate?: boolean,
+ /** Number of threads to use during computation (Default: 2 for 4-core devices, 4 for more cores) */
+ maxThreads?: number,
+ /** Maximum number of text context tokens to store */
+ maxContext?: number,
+ /** Maximum segment length in characters */
+ maxLen?: number,
+ /** Enable token-level timestamps */
+ tokenTimestamps?: boolean,
+ /** Word timestamp probability threshold */
+ wordThold?: number,
+ /** Time offset in milliseconds */
+ offset?: number,
+ /** Duration of audio to process in milliseconds */
+ duration?: number,
+ /** Initial decoding temperature */
+ temperature?: number,
+ temperatureInc?: number,
+ /** Beam size for beam search */
+ beamSize?: number,
+ /** Number of best candidates to keep */
+ bestOf?: number,
+ /** Speed up audio by x2 (reduced accuracy) */
+ speedUp?: boolean,
+ /** Initial Prompt */
+ prompt?: string,
+ }
+
+ export type TranscribeResult = {
+ result: string,
+ segments: Array<{
+ text: string,
+ t0: number,
+ t1: number,
+ }>,
+ }
+
+ export type CoreMLAsset = {
+ uri: string,
+ filepath: string,
+ }
+
+ type NativeContextOptions = {
+ filePath: string,
+ isBundleAsset: boolean,
+ downloadCoreMLAssets?: boolean,
+ coreMLAssets?: CoreMLAsset[],
+ }
+
+ export interface Spec extends TurboModule {
+ getConstants(): {
+ useCoreML: boolean
+ coreMLAllowFallback: boolean
+ };
+ initContext(options: NativeContextOptions): Promise<number>;
+ releaseContext(contextId: number): Promise<void>;
+ releaseAllContexts(): Promise<void>;
+ transcribeFile(
+ contextId: number,
+ jobId: number,
+ path: string,
+ options: TranscribeOptions,
+ ): Promise<TranscribeResult>;
+ startRealtimeTranscribe(
+ contextId: number,
+ jobId: number,
+ options: TranscribeOptions,
+ ): Promise<void>;
+ abortTranscribe(contextId: number, jobId: number): Promise<void>;
+ }
+
+ export default TurboModuleRegistry.get<Spec>('RNWhisper') as Spec
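
This new spec file moves the native bindings onto React Native's TurboModule codegen path (`TurboModuleRegistry.get<Spec>('RNWhisper')`), replacing the `NativeModules.RNWhisper` lookup removed from `index.ts` below. The option and result types declared here are re-exported from the package entry point, so caller-side options are typed against this spec. A minimal sketch, assuming only the `whisper.rn` type exports shown in this diff:

```ts
import type { TranscribeOptions } from 'whisper.rn'

// All fields are optional; the comments mirror the doc comments in the spec above.
const options: TranscribeOptions = {
  language: 'en', // 'auto' (auto-detect) by default
  translate: false, // translate from the source language to English
  maxThreads: 4, // default: 2 on 4-core devices, 4 on devices with more cores
  tokenTimestamps: true,
  beamSize: 5,
}
```
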
package/src/index.ts CHANGED
@@ -1,77 +1,39 @@
  import {
  NativeEventEmitter,
  DeviceEventEmitter,
- NativeModules,
  Platform,
  DeviceEventEmitterStatic,
+ Image,
  } from 'react-native'
+ import RNWhisper from './NativeRNWhisper'
+ import type {
+ TranscribeOptions,
+ TranscribeResult,
+ CoreMLAsset,
+ } from './NativeRNWhisper'
  import { version } from './version.json'

- const LINKING_ERROR =
- `The package 'whisper.rn' doesn't seem to be linked. Make sure: \n\n${Platform.select({ ios: "- You have run 'pod install'\n", default: '' })
- }- You rebuilt the app after installing the package`
-
- const RNWhisper = NativeModules.RNWhisper
- ? NativeModules.RNWhisper
- : new Proxy(
- {},
- {
- get() {
- throw new Error(LINKING_ERROR)
- },
- },
- )
-
  let EventEmitter: NativeEventEmitter | DeviceEventEmitterStatic
  if (Platform.OS === 'ios') {
+ // @ts-ignore
  EventEmitter = new NativeEventEmitter(RNWhisper)
  }
  if (Platform.OS === 'android') {
  EventEmitter = DeviceEventEmitter
  }

+ export type { TranscribeOptions, TranscribeResult }
+
  const EVENT_ON_REALTIME_TRANSCRIBE = '@RNWhisper_onRealtimeTranscribe'
  const EVENT_ON_REALTIME_TRANSCRIBE_END = '@RNWhisper_onRealtimeTranscribeEnd'

- export type TranscribeOptions = {
- /** Spoken language (Default: 'auto' for auto-detect) */
- language?: string,
- /** Translate from source language to english (Default: false) */
- translate?: boolean,
- /** Number of threads to use during computation (Default: 2 for 4-core devices, 4 for more cores) */
- maxThreads?: number,
- /** Maximum number of text context tokens to store */
- maxContext?: number,
- /** Maximum segment length in characters */
- maxLen?: number,
- /** Enable token-level timestamps */
- tokenTimestamps?: boolean,
- /** Word timestamp probability threshold */
- wordThold?: number,
- /** Time offset in milliseconds */
- offset?: number,
- /** Duration of audio to process in milliseconds */
- duration?: number,
- /** Tnitial decoding temperature */
- temperature?: number,
- temperatureInc?: number,
- /** Beam size for beam search */
- beamSize?: number,
- /** Number of best candidates to keep */
- bestOf?: number,
- /** Speed up audio by x2 (reduced accuracy) */
- speedUp?: boolean,
- /** Initial Prompt */
- prompt?: string,
- }
-
  export type TranscribeRealtimeOptions = TranscribeOptions & {
  /**
- * Realtime record max duration in seconds.
+ * Realtime record max duration in seconds.
  * Due to the whisper.cpp hard constraint - processes the audio in chunks of 30 seconds,
  * the recommended value will be <= 30 seconds. (Default: 30)
  */
- realtimeAudioSec?: number,
+ realtimeAudioSec?: number
  /**
  * Optimize audio transcription performance by slicing audio samples when `realtimeAudioSec` > 30.
  * Set `realtimeAudioSliceSec` < 30 so performance improvements can be achieved in the Whisper hard constraint (processes the audio in chunks of 30 seconds).
@@ -80,52 +42,43 @@ export type TranscribeRealtimeOptions = TranscribeOptions & {
  realtimeAudioSliceSec?: number
  }

- export type TranscribeResult = {
- result: string,
- segments: Array<{
- text: string,
- t0: number,
- t1: number,
- }>,
- }
-
  export type TranscribeRealtimeEvent = {
- contextId: number,
- jobId: number,
+ contextId: number
+ jobId: number
  /** Is capturing audio, when false, the event is the final result */
- isCapturing: boolean,
- isStoppedByAction?: boolean,
- code: number,
- data?: TranscribeResult,
- error?: string,
- processTime: number,
- recordingTime: number,
+ isCapturing: boolean
+ isStoppedByAction?: boolean
+ code: number
+ data?: TranscribeResult
+ error?: string
+ processTime: number
+ recordingTime: number
  slices?: Array<{
- code: number,
- error?: string,
- data?: TranscribeResult,
- processTime: number,
- recordingTime: number,
- }>,
+ code: number
+ error?: string
+ data?: TranscribeResult
+ processTime: number
+ recordingTime: number
+ }>
  }

  export type TranscribeRealtimeNativePayload = {
  /** Is capturing audio, when false, the event is the final result */
- isCapturing: boolean,
- isStoppedByAction?: boolean,
- code: number,
- processTime: number,
- recordingTime: number,
- isUseSlices: boolean,
- sliceIndex: number,
- data?: TranscribeResult,
- error?: string,
+ isCapturing: boolean
+ isStoppedByAction?: boolean
+ code: number
+ processTime: number
+ recordingTime: number
+ isUseSlices: boolean
+ sliceIndex: number
+ data?: TranscribeResult
+ error?: string
  }

  export type TranscribeRealtimeNativeEvent = {
- contextId: number,
- jobId: number,
- payload: TranscribeRealtimeNativePayload,
+ contextId: number
+ jobId: number
+ payload: TranscribeRealtimeNativePayload
  }

  export class WhisperContext {
@@ -136,12 +89,29 @@ export class WhisperContext {
  }

  /** Transcribe audio file */
- transcribe(path: string, options: TranscribeOptions = {}): {
+ transcribe(
+ filePath: string | number,
+ options: TranscribeOptions = {},
+ ): {
  /** Stop the transcribe */
- stop: () => void,
+ stop: () => void
  /** Transcribe result promise */
- promise: Promise<TranscribeResult>,
+ promise: Promise<TranscribeResult>
  } {
+ let path = ''
+ if (typeof filePath === 'number') {
+ try {
+ const source = Image.resolveAssetSource(filePath)
+ if (source) path = source.uri
+ } catch (e) {
+ throw new Error(`Invalid asset: ${filePath}`)
+ }
+ } else {
+ if (filePath.startsWith('http'))
+ throw new Error('Transcribe remote file is not supported, please download it first')
+ path = filePath
+ }
+ if (path.startsWith('file://')) path = path.slice(7)
  const jobId: number = Math.floor(Math.random() * 10000)
  return {
  stop: () => RNWhisper.abortTranscribe(this.id, jobId),
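
`transcribe` now accepts either a string path or a number returned by `require` (a bundled asset resolved through `Image.resolveAssetSource`); a leading `file://` is stripped, and plain `http(s)` URLs are rejected with an error. A usage sketch, assuming an already-initialized context and hypothetical file/asset paths:

```ts
import { WhisperContext } from 'whisper.rn'

// Sketch only: `context` comes from initWhisper(), and both paths below are hypothetical.
async function transcribeExamples(context: WhisperContext) {
  // Plain path: the `file://` prefix is stripped before the call reaches the native module.
  const { stop, promise } = context.transcribe('file:///tmp/recording.wav', {
    language: 'en',
  })
  const { result, segments } = await promise
  console.log(result, segments.length)
  stop() // aborts the native job if it is still running

  // Bundled asset: the number from require() is resolved via Image.resolveAssetSource.
  await context.transcribe(require('./assets/sample.wav'), {}).promise
}
```
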
@@ -152,9 +122,9 @@ export class WhisperContext {
  /** Transcribe the microphone audio stream, the microphone user permission is required */
  async transcribeRealtime(options: TranscribeRealtimeOptions = {}): Promise<{
  /** Stop the realtime transcribe */
- stop: () => void,
+ stop: () => void
  /** Subscribe to realtime transcribe events */
- subscribe: (callback: (event: TranscribeRealtimeEvent) => void) => void,
+ subscribe: (callback: (event: TranscribeRealtimeEvent) => void) => void
  }> {
  const jobId: number = Math.floor(Math.random() * 10000)
  await RNWhisper.startRealtimeTranscribe(this.id, jobId, options)
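
The realtime API keeps the same shape in this hunk (only trailing-comma formatting changes): `transcribeRealtime` resolves to a `stop`/`subscribe` pair, and events arrive as `TranscribeRealtimeEvent`. A consumption sketch, assuming microphone permission has already been granted:

```ts
import { WhisperContext } from 'whisper.rn'

// Sketch only: `context` comes from initWhisper().
async function realtimeExample(context: WhisperContext) {
  const { stop, subscribe } = await context.transcribeRealtime({
    realtimeAudioSec: 60,
    realtimeAudioSliceSec: 25, // keep each slice under the 30 s whisper.cpp chunk limit
  })

  subscribe((event) => {
    const { isCapturing, data, processTime, recordingTime } = event
    console.log(`[${recordingTime}ms rec / ${processTime}ms proc]`, data?.result)
    if (!isCapturing) console.log('Final event received')
  })

  // Stop capturing manually after 30 seconds.
  setTimeout(() => stop(), 30000)
}
```
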
@@ -170,37 +140,42 @@ export class WhisperContext {
  const { segments = [] } = slices[sliceIndex]?.data || {}
  tOffset = segments[segments.length - 1]?.t1 || 0
  }
- ({ sliceIndex } = payload)
+ ;({ sliceIndex } = payload)
  slices[sliceIndex] = {
  ...payload,
- data: payload.data ? {
- ...payload.data,
- segments: payload.data.segments.map((segment) => ({
- ...segment,
- t0: segment.t0 + tOffset,
- t1: segment.t1 + tOffset,
- })) || [],
- } : undefined,
+ data: payload.data
+ ? {
+ ...payload.data,
+ segments:
+ payload.data.segments.map((segment) => ({
+ ...segment,
+ t0: segment.t0 + tOffset,
+ t1: segment.t1 + tOffset,
+ })) || [],
+ }
+ : undefined,
  }
  }

- const mergeSlicesIfNeeded = (payload: TranscribeRealtimeNativePayload): TranscribeRealtimeNativePayload => {
+ const mergeSlicesIfNeeded = (
+ payload: TranscribeRealtimeNativePayload,
+ ): TranscribeRealtimeNativePayload => {
  if (!payload.isUseSlices) return payload

  const mergedPayload: any = {}
- slices.forEach(
- (slice) => {
- mergedPayload.data = {
- result: (mergedPayload.data?.result || '') + (slice.data?.result || ''),
- segments: [
- ...(mergedPayload?.data?.segments || []),
- ...(slice.data?.segments || []),
- ],
- }
- mergedPayload.processTime = slice.processTime
- mergedPayload.recordingTime = (mergedPayload?.recordingTime || 0) + slice.recordingTime
+ slices.forEach((slice) => {
+ mergedPayload.data = {
+ result:
+ (mergedPayload.data?.result || '') + (slice.data?.result || ''),
+ segments: [
+ ...(mergedPayload?.data?.segments || []),
+ ...(slice.data?.segments || []),
+ ],
  }
- )
+ mergedPayload.processTime = slice.processTime
+ mergedPayload.recordingTime =
+ (mergedPayload?.recordingTime || 0) + slice.recordingTime
+ })
  return { ...payload, ...mergedPayload, slices }
  }

@@ -219,7 +194,7 @@ export class WhisperContext {
  jobId: evt.jobId,
  ...mergeSlicesIfNeeded(payload),
  })
- }
+ },
  )
  let endListener: any = EventEmitter.addListener(
  EVENT_ON_REALTIME_TRANSCRIBE_END,
@@ -235,7 +210,7 @@ export class WhisperContext {
  contextId,
  jobId: evt.jobId,
  ...mergeSlicesIfNeeded(lastPayload),
- isCapturing: false
+ isCapturing: false,
  })
  if (transcribeListener) {
  transcribeListener.remove()
@@ -245,7 +220,7 @@ export class WhisperContext {
  endListener.remove()
  endListener = null
  }
- }
+ },
  )
  },
  }
@@ -256,10 +231,75 @@ export class WhisperContext {
  }
  }

- export async function initWhisper(
- { filePath, isBundleAsset }: { filePath?: string, isBundleAsset?: boolean } = {}
- ): Promise<WhisperContext> {
- const id = await RNWhisper.initContext(filePath, !!isBundleAsset)
+ export type ContextOptions = {
+ filePath: string | number
+ /**
+ * CoreML model assets. If you're using `require` on filePath,
+ * this option is required to enable Core ML;
+ * you will need to bundle weights/weight.bin, model.mil, coremldata.bin into the app via `require`
+ */
+ coreMLModelAsset?: {
+ filename: string
+ assets: number[]
+ }
+ /** Is the file path a bundle asset for pure string filePath */
+ isBundleAsset?: boolean
+ }
+
+ const coreMLModelAssetPaths = [
+ 'analytics/coremldata.bin',
+ 'weights/weight.bin',
+ 'model.mil',
+ 'coremldata.bin',
+ ]
+
+ export async function initWhisper({
+ filePath,
+ coreMLModelAsset,
+ isBundleAsset,
+ }: ContextOptions): Promise<WhisperContext> {
+ let path = ''
+ let coreMLAssets: CoreMLAsset[] | undefined
+ if (coreMLModelAsset) {
+ const { filename, assets } = coreMLModelAsset
+ if (filename && assets) {
+ coreMLAssets = assets
+ ?.map((asset) => {
+ const { uri } = Image.resolveAssetSource(asset)
+ const filepath = coreMLModelAssetPaths.find((p) => uri.includes(p))
+ if (filepath) {
+ return {
+ uri,
+ filepath: `${filename}/${filepath}`,
+ }
+ }
+ return undefined
+ })
+ .filter((asset): asset is CoreMLAsset => asset !== undefined)
+ }
+ }
+ if (typeof filePath === 'number') {
+ try {
+ const source = Image.resolveAssetSource(filePath)
+ if (source) {
+ path = source.uri
+ }
+ } catch (e) {
+ throw new Error(`Invalid asset: ${filePath}`)
+ }
+ } else {
+ if (!isBundleAsset && filePath.startsWith('http'))
+ throw new Error('Transcribe remote file is not supported, please download it first')
+ path = filePath
+ }
+ if (path.startsWith('file://')) path = path.slice(7)
+ const id = await RNWhisper.initContext({
+ filePath: path,
+ isBundleAsset: !!isBundleAsset,
+ // Only development mode needs to download Core ML model assets (from the packager server)
+ downloadCoreMLAssets: __DEV__ && !!coreMLAssets,
+ coreMLAssets,
+ })
  return new WhisperContext(id)
  }
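
`initWhisper` now takes a single `ContextOptions` object: `filePath` may be a string or a `require`'d model asset, and `coreMLModelAsset` lets the Core ML model files matched by `coreMLModelAssetPaths` be bundled via `require` as well (in development they are served by the packager, hence `downloadCoreMLAssets: __DEV__ && !!coreMLAssets`). A sketch with hypothetical asset filenames; requiring `.bin`/`.mil` files generally needs those extensions added to Metro's `assetExts`:

```ts
import { Platform } from 'react-native'
import { initWhisper } from 'whisper.rn'

// Sketch only: the model and Core ML asset filenames below are hypothetical examples.
async function createContext() {
  return initWhisper({
    filePath: require('./assets/ggml-tiny.en.bin'),
    coreMLModelAsset:
      Platform.OS === 'ios'
        ? {
            filename: 'ggml-tiny.en-encoder.mlmodelc',
            assets: [
              require('./assets/ggml-tiny.en-encoder.mlmodelc/weights/weight.bin'),
              require('./assets/ggml-tiny.en-encoder.mlmodelc/model.mil'),
              require('./assets/ggml-tiny.en-encoder.mlmodelc/coremldata.bin'),
            ],
          }
        : undefined,
  })
}
```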
 
@@ -270,8 +310,10 @@ export async function releaseAllWhisper(): Promise<void> {
  /** Current version of whisper.cpp */
  export const libVersion: string = version

+ const { useCoreML, coreMLAllowFallback } = RNWhisper.getConstants?.() || {}
+
  /** Is use CoreML models on iOS */
- export const isUseCoreML: boolean = !!RNWhisper.WHISPER_USE_COREML
+ export const isUseCoreML: boolean = !!useCoreML

  /** Is allow fallback to CPU if load CoreML model failed */
- export const isCoreMLAllowFallback: boolean = !!RNWhisper.WHISPER_COREML_ALLOW_FALLBACK
+ export const isCoreMLAllowFallback: boolean = !!coreMLAllowFallback
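
The Core ML flags are now read once through the TurboModule's `getConstants()` instead of the old `WHISPER_USE_COREML` / `WHISPER_COREML_ALLOW_FALLBACK` module constants; the exported names are unchanged, so existing imports keep working:

```ts
import { isUseCoreML, isCoreMLAllowFallback } from 'whisper.rn'

// Same exported booleans as before, now backed by RNWhisper.getConstants().
if (isUseCoreML) {
  console.log('Core ML build; CPU fallback allowed:', isCoreMLAllowFallback)
}
```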