whisper.rn 0.3.0-rc.5 → 0.3.0-rc.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts CHANGED
@@ -3,9 +3,14 @@ import {
   DeviceEventEmitter,
   Platform,
   DeviceEventEmitterStatic,
+  Image,
 } from 'react-native'
 import RNWhisper from './NativeRNWhisper'
-import type { TranscribeOptions, TranscribeResult } from './NativeRNWhisper'
+import type {
+  TranscribeOptions,
+  TranscribeResult,
+  CoreMLAsset,
+} from './NativeRNWhisper'
 import { version } from './version.json'
 
 let EventEmitter: NativeEventEmitter | DeviceEventEmitterStatic
@@ -24,11 +29,11 @@ const EVENT_ON_REALTIME_TRANSCRIBE_END = '@RNWhisper_onRealtimeTranscribeEnd'
 
 export type TranscribeRealtimeOptions = TranscribeOptions & {
   /**
-   * Realtime record max duration in seconds.
+   * Realtime record max duration in seconds.
    * Due to the whisper.cpp hard constraint - processes the audio in chunks of 30 seconds,
    * the recommended value will be <= 30 seconds. (Default: 30)
    */
-  realtimeAudioSec?: number,
+  realtimeAudioSec?: number
   /**
    * Optimize audio transcription performance by slicing audio samples when `realtimeAudioSec` > 30.
    * Set `realtimeAudioSliceSec` < 30 so performance improvements can be achieved in the Whisper hard constraint (processes the audio in chunks of 30 seconds).
@@ -38,42 +43,42 @@ export type TranscribeRealtimeOptions = TranscribeOptions & {
 }
 
 export type TranscribeRealtimeEvent = {
-  contextId: number,
-  jobId: number,
+  contextId: number
+  jobId: number
   /** Is capturing audio, when false, the event is the final result */
-  isCapturing: boolean,
-  isStoppedByAction?: boolean,
-  code: number,
-  data?: TranscribeResult,
-  error?: string,
-  processTime: number,
-  recordingTime: number,
+  isCapturing: boolean
+  isStoppedByAction?: boolean
+  code: number
+  data?: TranscribeResult
+  error?: string
+  processTime: number
+  recordingTime: number
   slices?: Array<{
-    code: number,
-    error?: string,
-    data?: TranscribeResult,
-    processTime: number,
-    recordingTime: number,
-  }>,
+    code: number
+    error?: string
+    data?: TranscribeResult
+    processTime: number
+    recordingTime: number
+  }>
 }
 
 export type TranscribeRealtimeNativePayload = {
   /** Is capturing audio, when false, the event is the final result */
-  isCapturing: boolean,
-  isStoppedByAction?: boolean,
-  code: number,
-  processTime: number,
-  recordingTime: number,
-  isUseSlices: boolean,
-  sliceIndex: number,
-  data?: TranscribeResult,
-  error?: string,
+  isCapturing: boolean
+  isStoppedByAction?: boolean
+  code: number
+  processTime: number
+  recordingTime: number
+  isUseSlices: boolean
+  sliceIndex: number
+  data?: TranscribeResult
+  error?: string
 }
 
 export type TranscribeRealtimeNativeEvent = {
-  contextId: number,
-  jobId: number,
-  payload: TranscribeRealtimeNativePayload,
+  contextId: number
+  jobId: number
+  payload: TranscribeRealtimeNativePayload
 }
 
 export class WhisperContext {
@@ -84,12 +89,29 @@ export class WhisperContext {
   }
 
   /** Transcribe audio file */
-  transcribe(path: string, options: TranscribeOptions = {}): {
+  transcribe(
+    filePath: string | number,
+    options: TranscribeOptions = {},
+  ): {
     /** Stop the transcribe */
-    stop: () => void,
+    stop: () => void
     /** Transcribe result promise */
-    promise: Promise<TranscribeResult>,
+    promise: Promise<TranscribeResult>
   } {
+    let path = ''
+    if (typeof filePath === 'number') {
+      try {
+        const source = Image.resolveAssetSource(filePath)
+        if (source) path = source.uri
+      } catch (e) {
+        throw new Error(`Invalid asset: ${filePath}`)
+      }
+    } else {
+      if (filePath.startsWith('http'))
+        throw new Error('Transcribe remote file is not supported, please download it first')
+      path = filePath
+    }
+    if (path.startsWith('file://')) path = path.slice(7)
     const jobId: number = Math.floor(Math.random() * 10000)
     return {
       stop: () => RNWhisper.abortTranscribe(this.id, jobId),
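
With this change, `transcribe` accepts either a file path string or an asset id from `require` (a number in React Native), resolves it through `Image.resolveAssetSource`, strips a `file://` prefix, and rejects remote `http(s)` paths. A minimal usage sketch, assuming an already-initialized `whisperContext` and a hypothetical bundled `./assets/sample.wav` (the `language` option is illustrative):

// Sketch only: whisperContext and the asset path are assumptions, not part of this diff.
const { stop, promise } = whisperContext.transcribe(
  require('./assets/sample.wav'), // resolved via Image.resolveAssetSource
  { language: 'en' },
)
const { result } = await promise // transcription text
// stop() aborts the job; passing an 'https://...' path now throws instead.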
@@ -100,9 +122,9 @@ export class WhisperContext {
   /** Transcribe the microphone audio stream, the microphone user permission is required */
   async transcribeRealtime(options: TranscribeRealtimeOptions = {}): Promise<{
     /** Stop the realtime transcribe */
-    stop: () => void,
+    stop: () => void
     /** Subscribe to realtime transcribe events */
-    subscribe: (callback: (event: TranscribeRealtimeEvent) => void) => void,
+    subscribe: (callback: (event: TranscribeRealtimeEvent) => void) => void
   }> {
     const jobId: number = Math.floor(Math.random() * 10000)
     await RNWhisper.startRealtimeTranscribe(this.id, jobId, options)
@@ -118,37 +140,42 @@ export class WhisperContext {
         const { segments = [] } = slices[sliceIndex]?.data || {}
         tOffset = segments[segments.length - 1]?.t1 || 0
       }
-      ({ sliceIndex } = payload)
+      ;({ sliceIndex } = payload)
       slices[sliceIndex] = {
         ...payload,
-        data: payload.data ? {
-          ...payload.data,
-          segments: payload.data.segments.map((segment) => ({
-            ...segment,
-            t0: segment.t0 + tOffset,
-            t1: segment.t1 + tOffset,
-          })) || [],
-        } : undefined,
+        data: payload.data
+          ? {
+              ...payload.data,
+              segments:
+                payload.data.segments.map((segment) => ({
+                  ...segment,
+                  t0: segment.t0 + tOffset,
+                  t1: segment.t1 + tOffset,
+                })) || [],
+            }
+          : undefined,
       }
     }
 
-    const mergeSlicesIfNeeded = (payload: TranscribeRealtimeNativePayload): TranscribeRealtimeNativePayload => {
+    const mergeSlicesIfNeeded = (
+      payload: TranscribeRealtimeNativePayload,
+    ): TranscribeRealtimeNativePayload => {
       if (!payload.isUseSlices) return payload
 
       const mergedPayload: any = {}
-      slices.forEach(
-        (slice) => {
-          mergedPayload.data = {
-            result: (mergedPayload.data?.result || '') + (slice.data?.result || ''),
-            segments: [
-              ...(mergedPayload?.data?.segments || []),
-              ...(slice.data?.segments || []),
-            ],
-          }
-          mergedPayload.processTime = slice.processTime
-          mergedPayload.recordingTime = (mergedPayload?.recordingTime || 0) + slice.recordingTime
+      slices.forEach((slice) => {
+        mergedPayload.data = {
+          result:
+            (mergedPayload.data?.result || '') + (slice.data?.result || ''),
+          segments: [
+            ...(mergedPayload?.data?.segments || []),
+            ...(slice.data?.segments || []),
+          ],
         }
-      )
+        mergedPayload.processTime = slice.processTime
+        mergedPayload.recordingTime =
+          (mergedPayload?.recordingTime || 0) + slice.recordingTime
+      })
       return { ...payload, ...mergedPayload, slices }
     }
 
@@ -167,7 +194,7 @@ export class WhisperContext {
               jobId: evt.jobId,
               ...mergeSlicesIfNeeded(payload),
             })
-          }
+          },
         )
         let endListener: any = EventEmitter.addListener(
           EVENT_ON_REALTIME_TRANSCRIBE_END,
@@ -183,7 +210,7 @@ export class WhisperContext {
               contextId,
               jobId: evt.jobId,
               ...mergeSlicesIfNeeded(lastPayload),
-              isCapturing: false
+              isCapturing: false,
             })
             if (transcribeListener) {
               transcribeListener.remove()
@@ -193,7 +220,7 @@ export class WhisperContext {
               endListener.remove()
               endListener = null
             }
-          }
+          },
         )
       },
     }
@@ -204,10 +231,75 @@ export class WhisperContext {
   }
 }
 
-export async function initWhisper(
-  { filePath, isBundleAsset }: { filePath: string; isBundleAsset?: boolean }
-): Promise<WhisperContext> {
-  const id = await RNWhisper.initContext(filePath, !!isBundleAsset)
+export type ContextOptions = {
+  filePath: string | number
+  /**
+   * CoreML model assets, if you're using `require` on filePath,
+   * use this option is required if you want to enable Core ML,
+   * you will need bundle weights/weight.bin, model.mil, coremldata.bin into app by `require`
+   */
+  coreMLModelAsset?: {
+    filename: string
+    assets: number[]
+  }
+  /** Is the file path a bundle asset for pure string filePath */
+  isBundleAsset?: boolean
+}
+
+const coreMLModelAssetPaths = [
+  'analytics/coremldata.bin',
+  'weights/weight.bin',
+  'model.mil',
+  'coremldata.bin',
+]
+
+export async function initWhisper({
+  filePath,
+  coreMLModelAsset,
+  isBundleAsset,
+}: ContextOptions): Promise<WhisperContext> {
+  let path = ''
+  let coreMLAssets: CoreMLAsset[] | undefined
+  if (coreMLModelAsset) {
+    const { filename, assets } = coreMLModelAsset
+    if (filename && assets) {
+      coreMLAssets = assets
+        ?.map((asset) => {
+          const { uri } = Image.resolveAssetSource(asset)
+          const filepath = coreMLModelAssetPaths.find((p) => uri.includes(p))
+          if (filepath) {
+            return {
+              uri,
+              filepath: `${filename}/${filepath}`,
+            }
+          }
+          return undefined
+        })
+        .filter((asset): asset is CoreMLAsset => asset !== undefined)
+    }
+  }
+  if (typeof filePath === 'number') {
+    try {
+      const source = Image.resolveAssetSource(filePath)
+      if (source) {
+        path = source.uri
+      }
+    } catch (e) {
+      throw new Error(`Invalid asset: ${filePath}`)
+    }
+  } else {
+    if (!isBundleAsset && filePath.startsWith('http'))
+      throw new Error('Transcribe remote file is not supported, please download it first')
+    path = filePath
+  }
+  if (path.startsWith('file://')) path = path.slice(7)
+  const id = await RNWhisper.initContext({
+    filePath: path,
+    isBundleAsset: !!isBundleAsset,
+    // Only development mode need download Core ML model assets (from packager server)
+    downloadCoreMLAssets: __DEV__ && !!coreMLAssets,
+    coreMLAssets,
+  })
   return new WhisperContext(id)
 }
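
For reference, a sketch of how the new `ContextOptions` shape might be used. The model and asset file names below are assumptions, not files shipped with the package; the Core ML assets must resolve to paths matching the entries in `coreMLModelAssetPaths`, and bundling `.bin`/`.mil` files via `require` generally also requires adding those extensions to Metro's `assetExts`.

// Sketch only: file names are hypothetical examples.
import { Platform } from 'react-native'
import { initWhisper } from 'whisper.rn'

const whisperContext = await initWhisper({
  // ggml model bundled as an asset (a number from require)
  filePath: require('./assets/ggml-tiny.en.bin'),
  // Core ML encoder assets, only meaningful on iOS
  coreMLModelAsset:
    Platform.OS === 'ios'
      ? {
          filename: 'ggml-tiny.en-encoder.mlmodelc',
          assets: [
            require('./assets/ggml-tiny.en-encoder.mlmodelc/weights/weight.bin'),
            require('./assets/ggml-tiny.en-encoder.mlmodelc/model.mil'),
            require('./assets/ggml-tiny.en-encoder.mlmodelc/coremldata.bin'),
          ],
        }
      : undefined,
})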