whisper.rn 0.5.0-rc.0 → 0.5.0-rc.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. package/README.md +128 -50
  2. package/android/build.gradle +1 -0
  3. package/android/src/main/CMakeLists.txt +1 -0
  4. package/android/src/main/java/com/rnwhisper/RNWhisper.java +35 -0
  5. package/android/src/main/java/com/rnwhisper/WhisperContext.java +33 -0
  6. package/android/src/main/jni.cpp +81 -0
  7. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +5 -0
  8. package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +5 -0
  9. package/cpp/jsi/RNWhisperJSI.cpp +42 -6
  10. package/ios/RNWhisper.mm +11 -0
  11. package/ios/RNWhisperContext.h +1 -0
  12. package/ios/RNWhisperContext.mm +46 -0
  13. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  14. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  15. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  16. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  17. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  18. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  19. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  20. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  21. package/lib/commonjs/AudioSessionIos.js +2 -1
  22. package/lib/commonjs/AudioSessionIos.js.map +1 -1
  23. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  24. package/lib/commonjs/index.js +50 -10
  25. package/lib/commonjs/index.js.map +1 -1
  26. package/lib/commonjs/jest-mock.js +126 -0
  27. package/lib/commonjs/jest-mock.js.map +1 -0
  28. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +857 -0
  29. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -0
  30. package/lib/commonjs/realtime-transcription/SliceManager.js +233 -0
  31. package/lib/commonjs/realtime-transcription/SliceManager.js.map +1 -0
  32. package/lib/commonjs/realtime-transcription/adapters/AudioPcmStreamAdapter.js +133 -0
  33. package/lib/commonjs/realtime-transcription/adapters/AudioPcmStreamAdapter.js.map +1 -0
  34. package/lib/commonjs/realtime-transcription/adapters/JestAudioStreamAdapter.js +201 -0
  35. package/lib/commonjs/realtime-transcription/adapters/JestAudioStreamAdapter.js.map +1 -0
  36. package/lib/commonjs/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js +309 -0
  37. package/lib/commonjs/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js.map +1 -0
  38. package/lib/commonjs/realtime-transcription/index.js +27 -0
  39. package/lib/commonjs/realtime-transcription/index.js.map +1 -0
  40. package/lib/commonjs/realtime-transcription/types.js +114 -0
  41. package/lib/commonjs/realtime-transcription/types.js.map +1 -0
  42. package/lib/commonjs/utils/WavFileReader.js +158 -0
  43. package/lib/commonjs/utils/WavFileReader.js.map +1 -0
  44. package/lib/commonjs/utils/WavFileWriter.js +181 -0
  45. package/lib/commonjs/utils/WavFileWriter.js.map +1 -0
  46. package/lib/commonjs/utils/common.js +25 -0
  47. package/lib/commonjs/utils/common.js.map +1 -0
  48. package/lib/module/AudioSessionIos.js +2 -1
  49. package/lib/module/AudioSessionIos.js.map +1 -1
  50. package/lib/module/NativeRNWhisper.js.map +1 -1
  51. package/lib/module/index.js +48 -10
  52. package/lib/module/index.js.map +1 -1
  53. package/lib/module/jest-mock.js +124 -0
  54. package/lib/module/jest-mock.js.map +1 -0
  55. package/lib/module/realtime-transcription/RealtimeTranscriber.js +851 -0
  56. package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -0
  57. package/lib/module/realtime-transcription/SliceManager.js +226 -0
  58. package/lib/module/realtime-transcription/SliceManager.js.map +1 -0
  59. package/lib/module/realtime-transcription/adapters/AudioPcmStreamAdapter.js +124 -0
  60. package/lib/module/realtime-transcription/adapters/AudioPcmStreamAdapter.js.map +1 -0
  61. package/lib/module/realtime-transcription/adapters/JestAudioStreamAdapter.js +194 -0
  62. package/lib/module/realtime-transcription/adapters/JestAudioStreamAdapter.js.map +1 -0
  63. package/lib/module/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js +302 -0
  64. package/lib/module/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js.map +1 -0
  65. package/lib/module/realtime-transcription/index.js +8 -0
  66. package/lib/module/realtime-transcription/index.js.map +1 -0
  67. package/lib/module/realtime-transcription/types.js +107 -0
  68. package/lib/module/realtime-transcription/types.js.map +1 -0
  69. package/lib/module/utils/WavFileReader.js +151 -0
  70. package/lib/module/utils/WavFileReader.js.map +1 -0
  71. package/lib/module/utils/WavFileWriter.js +174 -0
  72. package/lib/module/utils/WavFileWriter.js.map +1 -0
  73. package/lib/module/utils/common.js +18 -0
  74. package/lib/module/utils/common.js.map +1 -0
  75. package/lib/typescript/AudioSessionIos.d.ts +1 -1
  76. package/lib/typescript/AudioSessionIos.d.ts.map +1 -1
  77. package/lib/typescript/NativeRNWhisper.d.ts +1 -0
  78. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  79. package/lib/typescript/index.d.ts +8 -4
  80. package/lib/typescript/index.d.ts.map +1 -1
  81. package/lib/typescript/jest-mock.d.ts +2 -0
  82. package/lib/typescript/jest-mock.d.ts.map +1 -0
  83. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts +166 -0
  84. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -0
  85. package/lib/typescript/realtime-transcription/SliceManager.d.ts +72 -0
  86. package/lib/typescript/realtime-transcription/SliceManager.d.ts.map +1 -0
  87. package/lib/typescript/realtime-transcription/adapters/AudioPcmStreamAdapter.d.ts +22 -0
  88. package/lib/typescript/realtime-transcription/adapters/AudioPcmStreamAdapter.d.ts.map +1 -0
  89. package/lib/typescript/realtime-transcription/adapters/JestAudioStreamAdapter.d.ts +44 -0
  90. package/lib/typescript/realtime-transcription/adapters/JestAudioStreamAdapter.d.ts.map +1 -0
  91. package/lib/typescript/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.d.ts +75 -0
  92. package/lib/typescript/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.d.ts.map +1 -0
  93. package/lib/typescript/realtime-transcription/index.d.ts +6 -0
  94. package/lib/typescript/realtime-transcription/index.d.ts.map +1 -0
  95. package/lib/typescript/realtime-transcription/types.d.ts +222 -0
  96. package/lib/typescript/realtime-transcription/types.d.ts.map +1 -0
  97. package/lib/typescript/utils/WavFileReader.d.ts +61 -0
  98. package/lib/typescript/utils/WavFileReader.d.ts.map +1 -0
  99. package/lib/typescript/utils/WavFileWriter.d.ts +57 -0
  100. package/lib/typescript/utils/WavFileWriter.d.ts.map +1 -0
  101. package/lib/typescript/utils/common.d.ts +9 -0
  102. package/lib/typescript/utils/common.d.ts.map +1 -0
  103. package/package.json +23 -11
  104. package/src/AudioSessionIos.ts +3 -2
  105. package/src/NativeRNWhisper.ts +2 -0
  106. package/src/index.ts +74 -22
  107. package/{jest/mock.js → src/jest-mock.ts} +2 -2
  108. package/src/realtime-transcription/RealtimeTranscriber.ts +1015 -0
  109. package/src/realtime-transcription/SliceManager.ts +252 -0
  110. package/src/realtime-transcription/adapters/AudioPcmStreamAdapter.ts +143 -0
  111. package/src/realtime-transcription/adapters/JestAudioStreamAdapter.ts +251 -0
  112. package/src/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.ts +378 -0
  113. package/src/realtime-transcription/index.ts +34 -0
  114. package/src/realtime-transcription/types.ts +283 -0
  115. package/src/utils/WavFileReader.ts +202 -0
  116. package/src/utils/WavFileWriter.ts +206 -0
  117. package/src/utils/common.ts +17 -0
package/src/realtime-transcription/RealtimeTranscriber.ts
@@ -0,0 +1,1015 @@
+/* eslint-disable class-methods-use-this */
+import type { VadOptions } from '../index'
+import { SliceManager } from './SliceManager'
+import { WavFileWriter, WavFileWriterFs } from '../utils/WavFileWriter'
+import type {
+  RealtimeOptions,
+  RealtimeTranscribeEvent,
+  RealtimeVadEvent,
+  RealtimeTranscriberCallbacks,
+  RealtimeStatsEvent,
+  RealtimeTranscriberDependencies,
+  AudioStreamData,
+  AudioSliceNoData,
+  AudioStreamInterface,
+  AudioStreamConfig,
+  WhisperContextLike,
+  WhisperVadContextLike,
+} from './types'
+import { VAD_PRESETS } from './types'
+
+/**
+ * RealtimeTranscriber provides real-time audio transcription with VAD support.
+ *
+ * Features:
+ * - Automatic slice management based on duration
+ * - VAD-based speech detection and auto-slicing
+ * - Configurable auto-slice mechanism that triggers on speech_end/silence events
+ * - Memory management for audio slices
+ * - Queue-based transcription processing
+ */
+export class RealtimeTranscriber {
+  private whisperContext: WhisperContextLike
+
+  private vadContext?: WhisperVadContextLike
+
+  private audioStream: AudioStreamInterface
+
+  private fs?: WavFileWriterFs
+
+  private sliceManager: SliceManager
+
+  private callbacks: RealtimeTranscriberCallbacks = {}
+
+  private options: {
+    audioSliceSec: number
+    audioMinSec: number
+    maxSlicesInMemory: number
+    vadOptions: VadOptions
+    vadPreset?: keyof typeof VAD_PRESETS
+    autoSliceOnSpeechEnd: boolean
+    autoSliceThreshold: number
+    transcribeOptions: any
+    initialPrompt?: string
+    promptPreviousSlices: boolean
+    audioOutputPath?: string
+    audioStreamConfig?: AudioStreamConfig
+    logger: (message: string) => void
+  }
+
+  private isActive = false
+
+  private isTranscribing = false
+
+  private vadEnabled = false
+
+  private transcriptionQueue: Array<{
+    sliceIndex: number
+    audioData: Uint8Array
+  }> = []
+
+  private accumulatedData: Uint8Array = new Uint8Array(0)
+
+  private wavFileWriter: WavFileWriter | null = null
+
+  // Simplified VAD state management
+  private lastSpeechDetectedTime = 0
+
+  // Track VAD state for proper event transitions
+  private lastVadState: 'speech' | 'silence' = 'silence'
+
+  // Track last stats to emit only when changed
+  private lastStatsSnapshot: any = null
+
+  // Store transcription results by slice index
+  private transcriptionResults: Map<
+    number,
+    { slice: AudioSliceNoData; transcribeEvent: RealtimeTranscribeEvent }
+  > = new Map()
+
+  // Store VAD events by slice index for inclusion in transcribe events
+  private vadEvents: Map<number, RealtimeVadEvent> = new Map()
+
+  constructor(
+    dependencies: RealtimeTranscriberDependencies,
+    options: RealtimeOptions = {},
+    callbacks: RealtimeTranscriberCallbacks = {},
+  ) {
+    this.whisperContext = dependencies.whisperContext
+    this.vadContext = dependencies.vadContext
+    this.audioStream = dependencies.audioStream
+    this.fs = dependencies.fs
+    this.callbacks = callbacks
+
+    // Set default options with proper types
+    this.options = {
+      audioSliceSec: options.audioSliceSec || 30,
+      audioMinSec: options.audioMinSec || 1,
+      maxSlicesInMemory: options.maxSlicesInMemory || 3,
+      vadOptions: options.vadOptions || VAD_PRESETS.default,
+      vadPreset: options.vadPreset,
+      autoSliceOnSpeechEnd: options.autoSliceOnSpeechEnd || true,
+      autoSliceThreshold: options.autoSliceThreshold || 0.5,
+      transcribeOptions: options.transcribeOptions || {},
+      initialPrompt: options.initialPrompt,
+      promptPreviousSlices: options.promptPreviousSlices ?? true,
+      audioOutputPath: options.audioOutputPath,
+      logger: options.logger || (() => {}),
+    }
+
+    // Apply VAD preset if specified
+    if (this.options.vadPreset && VAD_PRESETS[this.options.vadPreset]) {
+      this.options.vadOptions = {
+        ...VAD_PRESETS[this.options.vadPreset],
+        ...this.options.vadOptions,
+      }
+    }
+
+    // Enable VAD if context is provided and not explicitly disabled
+    this.vadEnabled = !!this.vadContext
+
+    // Initialize managers
+    this.sliceManager = new SliceManager(
+      this.options.audioSliceSec,
+      this.options.maxSlicesInMemory,
+    )
+
+    // Set up audio stream callbacks
+    this.audioStream.onData(this.handleAudioData.bind(this))
+    this.audioStream.onError(this.handleError.bind(this))
+    this.audioStream.onStatusChange(this.handleAudioStatusChange.bind(this))
+  }
+
+  /**
+   * Start realtime transcription
+   */
+  async start(): Promise<void> {
+    if (this.isActive) {
+      throw new Error('Realtime transcription is already active')
+    }
+
+    try {
+      this.isActive = true
+      this.callbacks.onStatusChange?.(true)
+
+      // Reset all state to ensure clean start
+      this.reset()
+
+      // Initialize WAV file writer if output path is specified
+      if (this.fs && this.options.audioOutputPath) {
+        this.wavFileWriter = new WavFileWriter(
+          this.fs,
+          this.options.audioOutputPath,
+          {
+            sampleRate: this.options.audioStreamConfig?.sampleRate || 16000,
+            channels: this.options.audioStreamConfig?.channels || 1,
+            bitsPerSample: this.options.audioStreamConfig?.bitsPerSample || 16,
+          },
+        )
+        await this.wavFileWriter.initialize()
+      }
+
+      // Start audio recording
+      await this.audioStream.initialize({
+        sampleRate: this.options.audioStreamConfig?.sampleRate || 16000,
+        channels: this.options.audioStreamConfig?.channels || 1,
+        bitsPerSample: this.options.audioStreamConfig?.bitsPerSample || 16,
+        audioSource: this.options.audioStreamConfig?.audioSource || 6,
+        bufferSize: this.options.audioStreamConfig?.bufferSize || 16 * 1024,
+      })
+      await this.audioStream.start()
+
+      // Emit stats update for status change
+      this.emitStatsUpdate('status_change')
+
+      this.log('Realtime transcription started')
+    } catch (error) {
+      this.isActive = false
+      this.callbacks.onStatusChange?.(false)
+      throw error
+    }
+  }
+
+  /**
+   * Stop realtime transcription
+   */
+  async stop(): Promise<void> {
+    if (!this.isActive) {
+      return
+    }
+
+    try {
+      this.isActive = false
+
+      // Stop audio recording
+      await this.audioStream.stop()
+
+      // Process any remaining accumulated data
+      if (this.accumulatedData.length > 0) {
+        this.processAccumulatedDataForSliceManagement()
+      }
+
+      // Process any remaining queued transcriptions
+      await this.processTranscriptionQueue()
+
+      // Finalize WAV file
+      if (this.wavFileWriter) {
+        await this.wavFileWriter.finalize()
+        this.wavFileWriter = null
+      }
+
+      // Reset all state completely
+      this.reset()
+
+      this.callbacks.onStatusChange?.(false)
+
+      // Emit stats update for status change
+      this.emitStatsUpdate('status_change')
+
+      this.log('Realtime transcription stopped')
+    } catch (error) {
+      this.handleError(`Stop error: ${error}`)
+    }
+  }
+
+  /**
+   * Handle incoming audio data from audio stream
+   */
+  private handleAudioData(streamData: AudioStreamData): void {
+    if (!this.isActive) {
+      return
+    }
+
+    try {
+      // Write to WAV file if enabled (convert to Uint8Array for WavFileWriter)
+      if (this.wavFileWriter) {
+        this.wavFileWriter.appendAudioData(streamData.data).catch((error) => {
+          this.log(`Failed to write audio to WAV file: ${error}`)
+        })
+      }
+
+      // Always accumulate data for slice management
+      this.accumulateAudioData(streamData.data)
+    } catch (error) {
+      const errorMessage =
+        error instanceof Error ? error.message : 'Audio processing error'
+      this.handleError(errorMessage)
+    }
+  }
+
+  /**
+   * Accumulate audio data for slice management
+   */
+  private accumulateAudioData(newData: Uint8Array): void {
+    const combined = new Uint8Array(
+      this.accumulatedData.length + newData.length,
+    )
+    combined.set(this.accumulatedData)
+    combined.set(new Uint8Array(newData), this.accumulatedData.length)
+    this.accumulatedData = combined
+
+    // Process accumulated data when we have enough for slice management
+    const minBufferSamples = 16000 * 1 // 1 second for slice management
+    if (this.accumulatedData.length >= minBufferSamples) {
+      this.processAccumulatedDataForSliceManagement()
+    }
+  }
+
+  /**
+   * Process accumulated audio data through SliceManager
+   */
+  private processAccumulatedDataForSliceManagement(): void {
+    if (this.accumulatedData.length === 0) {
+      return
+    }
+
+    // Process through slice manager directly with Uint8Array
+    const result = this.sliceManager.addAudioData(this.accumulatedData)
+
+    if (result.slice) {
+      this.log(
+        `Slice ${result.slice.index} ready (${result.slice.data.length} bytes)`,
+      )
+
+      // Process VAD for the slice if enabled
+      if (!this.isTranscribing && this.vadEnabled) {
+        this.processSliceVAD(result.slice).catch((error: any) => {
+          this.handleError(`VAD processing error: ${error}`)
+        })
+      } else if (!this.isTranscribing) {
+        // If VAD is disabled, transcribe slices as they become ready
+        this.queueSliceForTranscription(result.slice).catch((error: any) => {
+          this.handleError(`Failed to queue slice for transcription: ${error}`)
+        })
+      } else {
+        this.log(`Skipping slice ${result.slice.index} - already transcribing`)
+      }
+
+      this.emitStatsUpdate('memory_change')
+    }
+
+    // Clear accumulated data
+    this.accumulatedData = new Uint8Array(0)
+  }
+
+  /**
+   * Check if auto-slice should be triggered based on VAD event and timing
+   */
+  private async checkAutoSlice(
+    vadEvent: RealtimeVadEvent,
+    _slice: any,
+  ): Promise<void> {
+    if (!this.options.autoSliceOnSpeechEnd || !this.vadEnabled) {
+      return
+    }
+
+    // Only trigger on speech_end or silence events
+    const shouldTriggerAutoSlice =
+      vadEvent.type === 'speech_end' || vadEvent.type === 'silence'
+
+    if (!shouldTriggerAutoSlice) {
+      return
+    }
+
+    // Get current slice info from SliceManager
+    const currentSliceInfo = this.sliceManager.getCurrentSliceInfo()
+    const currentSlice = this.sliceManager.getSliceByIndex(
+      currentSliceInfo.currentSliceIndex,
+    )
+
+    if (!currentSlice) {
+      return
+    }
+
+    // Calculate current slice duration
+    const currentDuration = (Date.now() - currentSlice.startTime) / 1000 // Convert to seconds
+    const targetDuration = this.options.audioSliceSec
+    const minDuration = this.options.audioMinSec
+    const autoSliceThreshold = targetDuration * this.options.autoSliceThreshold
+
+    // Check if conditions are met for auto-slice
+    const meetsMinDuration = currentDuration >= minDuration
+    const meetsThreshold = currentDuration >= autoSliceThreshold
+
+    if (meetsMinDuration && meetsThreshold) {
+      this.log(
+        `Auto-slicing on ${vadEvent.type} at ${currentDuration.toFixed(1)}s ` +
+          `(min: ${minDuration}s, threshold: ${autoSliceThreshold.toFixed(
+            1,
+          )}s, target: ${targetDuration}s)`,
+      )
+
+      // Force next slice
+      await this.nextSlice()
+    } else {
+      this.log(
+        `Auto-slice conditions not met on ${vadEvent.type}: ` +
+          `duration=${currentDuration.toFixed(
+            1,
+          )}s, min=${minDuration}s, threshold=${autoSliceThreshold.toFixed(
+            1,
+          )}s ` +
+          `(minOk=${meetsMinDuration}, thresholdOk=${meetsThreshold})`,
+      )
+    }
+  }
+
+  /**
+   * Process VAD for a completed slice
+   */
+  private async processSliceVAD(slice: any): Promise<void> {
+    try {
+      // Get audio data from the slice for VAD processing
+      const audioData = this.sliceManager.getAudioDataForTranscription(
+        slice.index,
+      )
+
+      if (!audioData) {
+        this.log(
+          `No audio data available for VAD processing of slice ${slice.index}`,
+        )
+        return
+      }
+
+      // Convert base64 back to Uint8Array for VAD processing
+
+      // Detect speech in the slice
+      const vadEvent = await this.detectSpeech(audioData, slice.index)
+      vadEvent.timestamp = Date.now()
+
+      // Store VAD event for inclusion in transcribe event
+      this.vadEvents.set(slice.index, vadEvent)
+
+      // Emit VAD event
+      this.callbacks.onVad?.(vadEvent)
+
+      // Check if auto-slice should be triggered
+      await this.checkAutoSlice(vadEvent, slice)
+
+      // Check if speech was detected and if we should transcribe
+      const isSpeech =
+        vadEvent.type === 'speech_start' || vadEvent.type === 'speech_continue'
+
+      const isSpeechEnd = vadEvent.type === 'speech_end'
+
+      if (isSpeech) {
+        const minDuration = this.options.audioMinSec
+        // Check minimum duration requirement
+        const speechDuration = slice.data.length / 16000 / 2 // Convert bytes to seconds (16kHz, 16-bit)
+
+        if (speechDuration >= minDuration) {
+          this.log(
+            `Speech detected in slice ${slice.index}, queueing for transcription`,
+          )
+          await this.queueSliceForTranscription(slice)
+        } else {
+          this.log(
+            `Speech too short in slice ${slice.index} (${speechDuration.toFixed(
+              2,
+            )}s < ${minDuration}s), skipping`,
+          )
+        }
+      } else if (isSpeechEnd) {
+        this.log(`Speech ended in slice ${slice.index}`)
+        // For speech_end events, we might want to queue the slice for transcription
+        // to capture the final part of the speech segment
+        const speechDuration = slice.data.length / 16000 / 2 // Convert bytes to seconds
+        const minDuration = this.options.audioMinSec
+
+        if (speechDuration >= minDuration) {
+          this.log(
+            `Speech end detected in slice ${slice.index}, queueing final segment for transcription`,
+          )
+          await this.queueSliceForTranscription(slice)
+        } else {
+          this.log(
+            `Speech end segment too short in slice ${
+              slice.index
+            } (${speechDuration.toFixed(2)}s < ${minDuration}s), skipping`,
+          )
+        }
+      } else {
+        this.log(`No speech detected in slice ${slice.index}`)
+      }
+
+      // Emit stats update for VAD change
+      this.emitStatsUpdate('vad_change')
+    } catch (error: any) {
+      this.handleError(
+        `VAD processing error for slice ${slice.index}: ${error}`,
+      )
+    }
+  }
+
+  /**
+   * Queue a slice for transcription
+   */
+  private async queueSliceForTranscription(slice: any): Promise<void> {
+    try {
+      // Get audio data from the slice
+      const audioData = this.sliceManager.getAudioDataForTranscription(
+        slice.index,
+      )
+
+      if (!audioData) {
+        this.log(`No audio data available for slice ${slice.index}`)
+        return
+      }
+
+      if (this.callbacks.onBeginTranscribe) {
+        const shouldTranscribe =
+          (await this.callbacks.onBeginTranscribe({
+            sliceIndex: slice.index,
+            audioData,
+            duration: (slice.data.length / 16000 / 2) * 1000, // Convert to milliseconds
+            vadEvent: this.vadEvents.get(slice.index),
+          })) ?? true
+
+        if (!shouldTranscribe) {
+          this.log(
+            `User callback declined transcription for slice ${slice.index}`,
+          )
+          return
+        }
+      }
+
+      // Add to transcription queue
+      this.transcriptionQueue.unshift({
+        sliceIndex: slice.index,
+        audioData,
+      })
+
+      this.log(
+        `Queued slice ${slice.index} for transcription (${slice.data.length} samples)`,
+      )
+
+      await this.processTranscriptionQueue()
+    } catch (error: any) {
+      this.handleError(`Failed to queue slice for transcription: ${error}`)
+    }
+  }
+
+  /**
+   * Detect speech using VAD context
+   */
+  private async detectSpeech(
+    audioData: Uint8Array,
+    sliceIndex: number,
+  ): Promise<RealtimeVadEvent> {
+    if (!this.vadContext) {
+      // When no VAD context is available, assume speech is always detected
+      // but still follow the state machine pattern
+      const currentTimestamp = Date.now()
+
+      // Assume speech is always detected when no VAD context
+      const vadEventType: RealtimeVadEvent['type'] =
+        this.lastVadState === 'silence' ? 'speech_start' : 'speech_continue'
+
+      // Update VAD state
+      this.lastVadState = 'speech'
+
+      const { sampleRate = 16000 } = this.options.audioStreamConfig || {}
+      return {
+        type: vadEventType,
+        lastSpeechDetectedTime: 0,
+        timestamp: currentTimestamp,
+        confidence: 1.0,
+        duration: audioData.length / sampleRate / 2, // Convert bytes to seconds
+        sliceIndex,
+      }
+    }
+
+    try {
+      const audioBuffer = audioData.buffer as ArrayBuffer
+
+      // Use VAD context to detect speech segments
+      const vadSegments = await this.vadContext.detectSpeechData(
+        audioBuffer,
+        this.options.vadOptions,
+      )
+
+      // Calculate confidence based on speech segments
+      let confidence = 0.0
+      let lastSpeechDetectedTime = 0
+      if (vadSegments && vadSegments.length > 0) {
+        // If there are speech segments, calculate average confidence
+        const totalTime = vadSegments.reduce(
+          (sum, segment) => sum + (segment.t1 - segment.t0),
+          0,
+        )
+        const audioDuration = audioData.length / 16000 / 2 // Convert bytes to seconds
+        confidence =
+          totalTime > 0 ? Math.min(totalTime / audioDuration, 1.0) : 0.0
+        lastSpeechDetectedTime = vadSegments[vadSegments.length - 1]?.t1 || -1
+      }
+
+      const threshold = this.options.vadOptions.threshold || 0.5
+      let isSpeech = confidence > threshold
+      const currentTimestamp = Date.now()
+
+      // Determine VAD event type based on current and previous state
+      let vadEventType: RealtimeVadEvent['type']
+      if (isSpeech) {
+        vadEventType =
+          this.lastVadState === 'silence' ? 'speech_start' : 'speech_continue'
+
+        const minDuration = this.options.audioMinSec
+        // Check if this is a new speech detection (different from last detected time)
+        if (
+          lastSpeechDetectedTime === this.lastSpeechDetectedTime ||
+          (lastSpeechDetectedTime - this.lastSpeechDetectedTime) / 100 <
+            minDuration
+        ) {
+          if (this.lastVadState === 'silence') vadEventType = 'silence'
+          if (this.lastVadState === 'speech') vadEventType = 'speech_end'
+          isSpeech = false
+          confidence = 0.0
+        }
+        this.lastSpeechDetectedTime = lastSpeechDetectedTime
+      } else {
+        vadEventType = this.lastVadState === 'speech' ? 'speech_end' : 'silence'
+      }
+
+      // Update VAD state for next detection
+      this.lastVadState = isSpeech ? 'speech' : 'silence'
+
+      const { sampleRate = 16000 } = this.options.audioStreamConfig || {}
+      return {
+        type: vadEventType,
+        lastSpeechDetectedTime,
+        timestamp: currentTimestamp,
+        confidence,
+        duration: audioData.length / sampleRate / 2, // Convert bytes to seconds
+        sliceIndex,
+        currentThreshold: threshold,
+      }
+    } catch (error) {
+      this.log(`VAD detection error: ${error}`)
+      // Re-throw the error so it can be handled by the caller
+      throw error
+    }
+  }
+
+  private isProcessingTranscriptionQueue = false
+
+  /**
+   * Process the transcription queue
+   */
+  private async processTranscriptionQueue(): Promise<void> {
+    if (this.isProcessingTranscriptionQueue) return
+
+    this.isProcessingTranscriptionQueue = true
+
+    while (this.transcriptionQueue.length > 0) {
+      const item = this.transcriptionQueue.shift()
+      this.transcriptionQueue = [] // Old items are not needed anymore
+      if (item) {
+        // eslint-disable-next-line no-await-in-loop
+        await this.processTranscription(item).catch((error) => {
+          this.handleError(`Transcription error: ${error}`)
+        })
+      }
+    }
+
+    this.isProcessingTranscriptionQueue = false
+  }
+
+  /**
+   * Build prompt from initial prompt and previous slices
+   */
+  private buildPrompt(currentSliceIndex: number): string | undefined {
+    const promptParts: string[] = []
+
+    // Add initial prompt if provided
+    if (this.options.initialPrompt) {
+      promptParts.push(this.options.initialPrompt)
+    }
+
+    // Add previous slice results if enabled
+    if (this.options.promptPreviousSlices) {
+      // Get transcription results from previous slices (up to the current slice)
+      const previousResults = Array.from(this.transcriptionResults.entries())
+        .filter(([sliceIndex]) => sliceIndex < currentSliceIndex)
+        .sort(([a], [b]) => a - b) // Sort by slice index
+        .map(([, result]) => result.transcribeEvent.data?.result)
+        .filter((result): result is string => Boolean(result)) // Filter out empty results with type guard
+
+      if (previousResults.length > 0) {
+        promptParts.push(...previousResults)
+      }
+    }
+
+    return promptParts.join(' ') || undefined
+  }
+
+  /**
+   * Process a single transcription
+   */
+  private async processTranscription(item: {
+    sliceIndex: number
+    audioData: Uint8Array
+  }): Promise<void> {
+    if (!this.isActive) {
+      return
+    }
+
+    this.isTranscribing = true
+
+    // Emit stats update for status change
+    this.emitStatsUpdate('status_change')
+
+    const startTime = Date.now()
+
+    try {
+      // Build prompt from initial prompt and previous slices
+      const prompt = this.buildPrompt(item.sliceIndex)
+
+      const audioBuffer = item.audioData.buffer as ArrayBuffer
+      const { promise } = this.whisperContext.transcribeData(audioBuffer, {
+        ...this.options.transcribeOptions,
+        prompt, // Include the constructed prompt
+        onProgress: undefined, // Disable progress for realtime
+      })
+
+      const result = await promise
+      const endTime = Date.now()
+
+      // Create transcribe event
+      const { sampleRate = 16000 } = this.options.audioStreamConfig || {}
+      const transcribeEvent: RealtimeTranscribeEvent = {
+        type: 'transcribe',
+        sliceIndex: item.sliceIndex,
+        data: result,
+        isCapturing: this.audioStream.isRecording(),
+        processTime: endTime - startTime,
+        recordingTime: item.audioData.length / (sampleRate / 1000) / 2, // ms,
+        memoryUsage: this.sliceManager.getMemoryUsage(),
+        vadEvent: this.vadEvents.get(item.sliceIndex),
+      }
+
+      // Save transcription results
+      const slice = this.sliceManager.getSliceByIndex(item.sliceIndex)
+      if (slice) {
+        this.transcriptionResults.set(item.sliceIndex, {
+          slice: {
+            // Don't keep data in the slice
+            index: slice.index,
+            sampleCount: slice.sampleCount,
+            startTime: slice.startTime,
+            endTime: slice.endTime,
+            isProcessed: slice.isProcessed,
+            isReleased: slice.isReleased,
+          },
+          transcribeEvent,
+        })
+      }
+
+      // Emit transcribe event
+      this.callbacks.onTranscribe?.(transcribeEvent)
+
+      this.vadEvents.delete(item.sliceIndex)
+
+      // Emit stats update for memory/slice changes
+      this.emitStatsUpdate('memory_change')
+
+      this.log(
+        `Transcribed speech segment ${item.sliceIndex}: "${result.result}"`,
+      )
+    } catch (error) {
+      // Emit error event to transcribe callback
+      const errorEvent: RealtimeTranscribeEvent = {
+        type: 'error',
+        sliceIndex: item.sliceIndex,
+        data: undefined,
+        isCapturing: this.audioStream.isRecording(),
+        processTime: Date.now() - startTime,
+        recordingTime: 0,
+        memoryUsage: this.sliceManager.getMemoryUsage(),
+        vadEvent: this.vadEvents.get(item.sliceIndex),
+      }
+
+      this.callbacks.onTranscribe?.(errorEvent)
+
+      this.vadEvents.delete(item.sliceIndex)
+
+      this.handleError(
+        `Transcription failed for speech segment ${item.sliceIndex}: ${error}`,
+      )
+    } finally {
+      // Check if we should continue processing queue
+      if (this.transcriptionQueue.length > 0) {
+        await this.processTranscriptionQueue()
+      } else {
+        this.isTranscribing = false
+      }
+    }
+  }
+
+  /**
+   * Handle audio status changes
+   */
+  private handleAudioStatusChange(isRecording: boolean): void {
+    this.log(`Audio recording: ${isRecording ? 'started' : 'stopped'}`)
+  }
+
+  /**
+   * Handle errors from components
+   */
+  private handleError(error: string): void {
+    this.log(`Error: ${error}`)
+    this.callbacks.onError?.(error)
+  }
+
+  /**
+   * Update callbacks
+   */
+  updateCallbacks(callbacks: Partial<RealtimeTranscriberCallbacks>): void {
+    this.callbacks = { ...this.callbacks, ...callbacks }
+  }
+
+  /**
+   * Update VAD options dynamically
+   */
+  updateVadOptions(options: Partial<VadOptions>): void {
+    this.options.vadOptions = { ...this.options.vadOptions, ...options }
+  }
+
+  /**
+   * Update auto-slice options dynamically
+   */
+  updateAutoSliceOptions(options: {
+    autoSliceOnSpeechEnd?: boolean
+    autoSliceThreshold?: number
+  }): void {
+    if (options.autoSliceOnSpeechEnd !== undefined) {
+      this.options.autoSliceOnSpeechEnd = options.autoSliceOnSpeechEnd
+    }
+    if (options.autoSliceThreshold !== undefined) {
+      this.options.autoSliceThreshold = options.autoSliceThreshold
+    }
+    this.log(
+      `Auto-slice options updated: enabled=${this.options.autoSliceOnSpeechEnd}, threshold=${this.options.autoSliceThreshold}`,
+    )
+  }
+
+  /**
+   * Get current statistics
+   */
+  getStatistics() {
+    return {
+      isActive: this.isActive,
+      isTranscribing: this.isTranscribing,
+      vadEnabled: this.vadEnabled,
+      audioStats: {
+        isRecording: this.audioStream.isRecording(),
+        accumulatedSamples: this.accumulatedData.length,
+      },
+      vadStats: this.vadEnabled
+        ? {
+            enabled: true,
+            contextAvailable: !!this.vadContext,
+            lastSpeechDetectedTime: this.lastSpeechDetectedTime,
+          }
+        : null,
+      sliceStats: this.sliceManager.getCurrentSliceInfo(),
+      autoSliceConfig: {
+        enabled: this.options.autoSliceOnSpeechEnd,
+        threshold: this.options.autoSliceThreshold,
+        targetDuration: this.options.audioSliceSec,
+        minDuration: this.options.audioMinSec,
+      },
+    }
+  }
+
+  /**
+   * Get all transcription results
+   */
+  getTranscriptionResults(): Array<{
+    slice: AudioSliceNoData
+    transcribeEvent: RealtimeTranscribeEvent
+  }> {
+    return Array.from(this.transcriptionResults.values())
+  }
+
+  /**
+   * Force move to the next slice, finalizing the current one regardless of capacity
+   */
+  async nextSlice(): Promise<void> {
+    if (!this.isActive) {
+      this.log('Cannot force next slice - transcriber is not active')
+      return
+    }
+
+    // Emit start event to indicate slice processing has started
+    const startEvent: RealtimeTranscribeEvent = {
+      type: 'start',
+      sliceIndex: -1, // Use -1 to indicate forced slice
+      data: undefined,
+      isCapturing: this.audioStream.isRecording(),
+      processTime: 0,
+      recordingTime: 0,
+      memoryUsage: this.sliceManager.getMemoryUsage(),
+    }
+
+    this.callbacks.onTranscribe?.(startEvent)
+
+    // Check if there are pending transcriptions or currently transcribing
+    if (this.isTranscribing || this.transcriptionQueue.length > 0) {
+      this.log(
+        'Waiting for pending transcriptions to complete before forcing next slice...',
+      )
+
+      // Wait for current transcription queue to be processed
+      await this.processTranscriptionQueue()
+    }
+
+    const result = this.sliceManager.forceNextSlice()
+
+    if (result.slice) {
+      this.log(
+        `Forced slice ${result.slice.index} ready (${result.slice.data.length} bytes)`,
+      )
+
+      // Process VAD for the slice if enabled
+      if (!this.isTranscribing && this.vadEnabled) {
+        this.processSliceVAD(result.slice).catch((error: any) => {
+          this.handleError(`VAD processing error: ${error}`)
+        })
+      } else if (!this.isTranscribing) {
+        // If VAD is disabled, transcribe slices as they become ready
+        this.queueSliceForTranscription(result.slice).catch((error: any) => {
+          this.handleError(`Failed to queue slice for transcription: ${error}`)
+        })
+      } else {
+        this.log(`Skipping slice ${result.slice.index} - already transcribing`)
+      }
+
+      this.emitStatsUpdate('memory_change')
+    } else {
+      this.log('Forced next slice but no slice data to process')
+    }
+  }
+
+  /**
+   * Reset all components
+   */
+  reset(): void {
+    this.sliceManager.reset()
+    this.transcriptionQueue = []
+    this.isTranscribing = false
+    this.accumulatedData = new Uint8Array(0)
+
+    // Reset simplified VAD state
+    this.lastSpeechDetectedTime = -1
+    this.lastVadState = 'silence'
+
+    // Reset stats snapshot for clean start
+    this.lastStatsSnapshot = null
+
+    // Cancel WAV file writing if in progress
+    if (this.wavFileWriter) {
+      this.wavFileWriter.cancel().catch((error) => {
+        this.log(`Failed to cancel WAV file writing: ${error}`)
+      })
+      this.wavFileWriter = null
+    }
+
+    // Clear transcription results
+    this.transcriptionResults.clear()
+
+    // Clear VAD events
+    this.vadEvents.clear()
+  }
+
+  /**
+   * Release all resources
+   */
+  async release(): Promise<void> {
+    if (this.isActive) {
+      await this.stop()
+    }
+
+    await this.audioStream.release()
+    await this.wavFileWriter?.finalize()
+    this.vadContext = undefined
+  }
+
+  /**
+   * Emit stats update event if stats have changed significantly
+   */
+  private emitStatsUpdate(eventType: RealtimeStatsEvent['type']): void {
+    const currentStats = this.getStatistics()
+
+    // Check if stats have changed significantly
+    if (
+      !this.lastStatsSnapshot ||
+      RealtimeTranscriber.shouldEmitStatsUpdate(
+        currentStats,
+        this.lastStatsSnapshot,
+      )
+    ) {
+      const statsEvent: RealtimeStatsEvent = {
+        timestamp: Date.now(),
+        type: eventType,
+        data: currentStats,
+      }
+
+      this.callbacks.onStatsUpdate?.(statsEvent)
+      this.lastStatsSnapshot = { ...currentStats }
+    }
+  }
+
+  /**
+   * Determine if stats update should be emitted
+   */
+  private static shouldEmitStatsUpdate(current: any, previous: any): boolean {
+    // Always emit on status changes
+    if (
+      current.isActive !== previous.isActive ||
+      current.isTranscribing !== previous.isTranscribing
+    ) {
+      return true
+    }
+
+    // Emit on significant memory changes (>10% or >5MB)
+    const currentMemory = current.sliceStats?.memoryUsage?.estimatedMB || 0
+    const previousMemory = previous.sliceStats?.memoryUsage?.estimatedMB || 0
+    const memoryDiff = Math.abs(currentMemory - previousMemory)
+
+    if (
+      memoryDiff > 5 ||
+      (previousMemory > 0 && memoryDiff / previousMemory > 0.1)
+    ) {
+      return true
+    }
+
+    return false
+  }
+
+  /**
+   * Logger function
+   */
+  private log(message: string): void {
+    this.options.logger(`[RealtimeTranscriber] ${message}`)
+  }
+}
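
For reference, a minimal usage sketch of the realtime API added in this release, based on the constructor, callbacks, and methods shown in the diff above. The import subpath, the initWhisper/initWhisperVad helpers, the model file names, and the AudioPcmStreamAdapter wiring are assumptions drawn from the file list rather than documented usage; see package/README.md in this release for the authoritative example.

import { initWhisper, initWhisperVad } from 'whisper.rn'
// Assumed subpath export; the realtime-transcription entry point is listed in this diff
import {
  RealtimeTranscriber,
  AudioPcmStreamAdapter,
} from 'whisper.rn/realtime-transcription'

// Contexts come from the existing whisper.rn init APIs (model paths are placeholders)
const whisperContext = await initWhisper({ filePath: 'ggml-base.bin' })
const vadContext = await initWhisperVad({ filePath: 'ggml-silero-v5.1.2.bin' })

const transcriber = new RealtimeTranscriber(
  // Dependencies: contexts plus an AudioStreamInterface implementation
  { whisperContext, vadContext, audioStream: new AudioPcmStreamAdapter() },
  // Options mirror the defaults set in the constructor above
  { audioSliceSec: 30, vadPreset: 'default', autoSliceOnSpeechEnd: true },
  {
    onTranscribe: (event) => console.log(event.sliceIndex, event.data?.result),
    onVad: (event) => console.log(event.type, event.confidence),
    onError: (error) => console.warn(error),
    onStatusChange: (isActive) => console.log('active:', isActive),
  },
)

await transcriber.start()
// ... later
await transcriber.stop()
await transcriber.release()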