whisper.rn 0.5.0-rc.1 → 0.5.0-rc.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/README.md +119 -50
  2. package/android/src/main/java/com/rnwhisper/RNWhisper.java +26 -0
  3. package/android/src/main/java/com/rnwhisper/WhisperContext.java +25 -0
  4. package/android/src/main/jni.cpp +81 -0
  5. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +5 -0
  6. package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +5 -0
  7. package/ios/RNWhisper.mm +11 -0
  8. package/ios/RNWhisperContext.h +1 -0
  9. package/ios/RNWhisperContext.mm +46 -0
  10. package/lib/commonjs/AudioSessionIos.js +2 -1
  11. package/lib/commonjs/AudioSessionIos.js.map +1 -1
  12. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  13. package/lib/commonjs/index.js +26 -0
  14. package/lib/commonjs/index.js.map +1 -1
  15. package/lib/commonjs/jest-mock.js +126 -0
  16. package/lib/commonjs/jest-mock.js.map +1 -0
  17. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +831 -0
  18. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -0
  19. package/lib/commonjs/realtime-transcription/SliceManager.js +233 -0
  20. package/lib/commonjs/realtime-transcription/SliceManager.js.map +1 -0
  21. package/lib/commonjs/realtime-transcription/adapters/AudioPcmStreamAdapter.js +133 -0
  22. package/lib/commonjs/realtime-transcription/adapters/AudioPcmStreamAdapter.js.map +1 -0
  23. package/lib/commonjs/realtime-transcription/adapters/JestAudioStreamAdapter.js +201 -0
  24. package/lib/commonjs/realtime-transcription/adapters/JestAudioStreamAdapter.js.map +1 -0
  25. package/lib/commonjs/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js +309 -0
  26. package/lib/commonjs/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js.map +1 -0
  27. package/lib/commonjs/realtime-transcription/index.js +27 -0
  28. package/lib/commonjs/realtime-transcription/index.js.map +1 -0
  29. package/lib/commonjs/realtime-transcription/types.js +114 -0
  30. package/lib/commonjs/realtime-transcription/types.js.map +1 -0
  31. package/lib/commonjs/utils/WavFileReader.js +158 -0
  32. package/lib/commonjs/utils/WavFileReader.js.map +1 -0
  33. package/lib/commonjs/utils/WavFileWriter.js +181 -0
  34. package/lib/commonjs/utils/WavFileWriter.js.map +1 -0
  35. package/lib/commonjs/utils/common.js +25 -0
  36. package/lib/commonjs/utils/common.js.map +1 -0
  37. package/lib/module/AudioSessionIos.js +2 -1
  38. package/lib/module/AudioSessionIos.js.map +1 -1
  39. package/lib/module/NativeRNWhisper.js.map +1 -1
  40. package/lib/module/index.js +24 -0
  41. package/lib/module/index.js.map +1 -1
  42. package/lib/module/jest-mock.js +124 -0
  43. package/lib/module/jest-mock.js.map +1 -0
  44. package/lib/module/realtime-transcription/RealtimeTranscriber.js +825 -0
  45. package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -0
  46. package/lib/module/realtime-transcription/SliceManager.js +226 -0
  47. package/lib/module/realtime-transcription/SliceManager.js.map +1 -0
  48. package/lib/module/realtime-transcription/adapters/AudioPcmStreamAdapter.js +124 -0
  49. package/lib/module/realtime-transcription/adapters/AudioPcmStreamAdapter.js.map +1 -0
  50. package/lib/module/realtime-transcription/adapters/JestAudioStreamAdapter.js +194 -0
  51. package/lib/module/realtime-transcription/adapters/JestAudioStreamAdapter.js.map +1 -0
  52. package/lib/module/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js +302 -0
  53. package/lib/module/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js.map +1 -0
  54. package/lib/module/realtime-transcription/index.js +8 -0
  55. package/lib/module/realtime-transcription/index.js.map +1 -0
  56. package/lib/module/realtime-transcription/types.js +107 -0
  57. package/lib/module/realtime-transcription/types.js.map +1 -0
  58. package/lib/module/utils/WavFileReader.js +151 -0
  59. package/lib/module/utils/WavFileReader.js.map +1 -0
  60. package/lib/module/utils/WavFileWriter.js +174 -0
  61. package/lib/module/utils/WavFileWriter.js.map +1 -0
  62. package/lib/module/utils/common.js +18 -0
  63. package/lib/module/utils/common.js.map +1 -0
  64. package/lib/typescript/AudioSessionIos.d.ts +1 -1
  65. package/lib/typescript/AudioSessionIos.d.ts.map +1 -1
  66. package/lib/typescript/NativeRNWhisper.d.ts +1 -0
  67. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  68. package/lib/typescript/index.d.ts +4 -0
  69. package/lib/typescript/index.d.ts.map +1 -1
  70. package/lib/typescript/jest-mock.d.ts +2 -0
  71. package/lib/typescript/jest-mock.d.ts.map +1 -0
  72. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts +165 -0
  73. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -0
  74. package/lib/typescript/realtime-transcription/SliceManager.d.ts +72 -0
  75. package/lib/typescript/realtime-transcription/SliceManager.d.ts.map +1 -0
  76. package/lib/typescript/realtime-transcription/adapters/AudioPcmStreamAdapter.d.ts +22 -0
  77. package/lib/typescript/realtime-transcription/adapters/AudioPcmStreamAdapter.d.ts.map +1 -0
  78. package/lib/typescript/realtime-transcription/adapters/JestAudioStreamAdapter.d.ts +44 -0
  79. package/lib/typescript/realtime-transcription/adapters/JestAudioStreamAdapter.d.ts.map +1 -0
  80. package/lib/typescript/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.d.ts +75 -0
  81. package/lib/typescript/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.d.ts.map +1 -0
  82. package/lib/typescript/realtime-transcription/index.d.ts +6 -0
  83. package/lib/typescript/realtime-transcription/index.d.ts.map +1 -0
  84. package/lib/typescript/realtime-transcription/types.d.ts +216 -0
  85. package/lib/typescript/realtime-transcription/types.d.ts.map +1 -0
  86. package/lib/typescript/utils/WavFileReader.d.ts +61 -0
  87. package/lib/typescript/utils/WavFileReader.d.ts.map +1 -0
  88. package/lib/typescript/utils/WavFileWriter.d.ts +57 -0
  89. package/lib/typescript/utils/WavFileWriter.d.ts.map +1 -0
  90. package/lib/typescript/utils/common.d.ts +9 -0
  91. package/lib/typescript/utils/common.d.ts.map +1 -0
  92. package/package.json +18 -6
  93. package/src/AudioSessionIos.ts +3 -2
  94. package/src/NativeRNWhisper.ts +2 -0
  95. package/src/index.ts +34 -0
  96. package/{jest/mock.js → src/jest-mock.ts} +2 -2
  97. package/src/realtime-transcription/RealtimeTranscriber.ts +983 -0
  98. package/src/realtime-transcription/SliceManager.ts +252 -0
  99. package/src/realtime-transcription/adapters/AudioPcmStreamAdapter.ts +143 -0
  100. package/src/realtime-transcription/adapters/JestAudioStreamAdapter.ts +251 -0
  101. package/src/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.ts +378 -0
  102. package/src/realtime-transcription/index.ts +34 -0
  103. package/src/realtime-transcription/types.ts +277 -0
  104. package/src/utils/WavFileReader.ts +202 -0
  105. package/src/utils/WavFileWriter.ts +206 -0
  106. package/src/utils/common.ts +17 -0
@@ -0,0 +1,983 @@
1
+ /* eslint-disable class-methods-use-this */
2
+ import type { VadOptions } from '../index'
3
+ import { SliceManager } from './SliceManager'
4
+ import { WavFileWriter, WavFileWriterFs } from '../utils/WavFileWriter'
5
+ import type {
6
+ RealtimeOptions,
7
+ RealtimeTranscribeEvent,
8
+ RealtimeVadEvent,
9
+ RealtimeTranscriberCallbacks,
10
+ RealtimeStatsEvent,
11
+ RealtimeTranscriberDependencies,
12
+ AudioStreamData,
13
+ AudioSliceNoData,
14
+ AudioStreamInterface,
15
+ AudioStreamConfig,
16
+ WhisperContextLike,
17
+ WhisperVadContextLike,
18
+ } from './types'
19
+ import { VAD_PRESETS } from './types'
20
+
21
+ /**
22
+ * RealtimeTranscriber provides real-time audio transcription with VAD support.
23
+ *
24
+ * Features:
25
+ * - Automatic slice management based on duration
26
+ * - VAD-based speech detection and auto-slicing
27
+ * - Configurable auto-slice mechanism that triggers on speech_end/silence events
28
+ * - Memory management for audio slices
29
+ * - Queue-based transcription processing
30
+ */
31
+ export class RealtimeTranscriber {
32
+ private whisperContext: WhisperContextLike
33
+
34
+ private vadContext?: WhisperVadContextLike
35
+
36
+ private audioStream: AudioStreamInterface
37
+
38
+ private fs?: WavFileWriterFs
39
+
40
+ private sliceManager: SliceManager
41
+
42
+ private callbacks: RealtimeTranscriberCallbacks = {}
43
+
44
+ private options: {
45
+ audioSliceSec: number
46
+ audioMinSec: number
47
+ maxSlicesInMemory: number
48
+ vadOptions: VadOptions
49
+ vadPreset?: keyof typeof VAD_PRESETS
50
+ autoSliceOnSpeechEnd: boolean
51
+ autoSliceThreshold: number
52
+ transcribeOptions: any
53
+ initialPrompt?: string
54
+ promptPreviousSlices: boolean
55
+ audioOutputPath?: string
56
+ audioStreamConfig?: AudioStreamConfig
57
+ logger: (message: string) => void
58
+ }
59
+
60
+ private isActive = false
61
+
62
+ private isTranscribing = false
63
+
64
+ private vadEnabled = false
65
+
66
+ private transcriptionQueue: Array<{
67
+ sliceIndex: number
68
+ audioData: Uint8Array
69
+ }> = []
70
+
71
+ private accumulatedData: Uint8Array = new Uint8Array(0)
72
+
73
+ private wavFileWriter: WavFileWriter | null = null
74
+
75
+ // Simplified VAD state management
76
+ private lastSpeechDetectedTime = 0
77
+
78
+ // Track VAD state for proper event transitions
79
+ private lastVadState: 'speech' | 'silence' = 'silence'
80
+
81
+ // Track last stats to emit only when changed
82
+ private lastStatsSnapshot: any = null
83
+
84
+ // Store transcription results by slice index
85
+ private transcriptionResults: Map<
86
+ number,
87
+ { slice: AudioSliceNoData; transcribeEvent: RealtimeTranscribeEvent }
88
+ > = new Map()
89
+
90
+ constructor(
91
+ dependencies: RealtimeTranscriberDependencies,
92
+ options: RealtimeOptions = {},
93
+ callbacks: RealtimeTranscriberCallbacks = {},
94
+ ) {
95
+ this.whisperContext = dependencies.whisperContext
96
+ this.vadContext = dependencies.vadContext
97
+ this.audioStream = dependencies.audioStream
98
+ this.fs = dependencies.fs
99
+ this.callbacks = callbacks
100
+
101
+ // Set default options with proper types
102
+ this.options = {
103
+ audioSliceSec: options.audioSliceSec || 30,
104
+ audioMinSec: options.audioMinSec || 1,
105
+ maxSlicesInMemory: options.maxSlicesInMemory || 3,
106
+ vadOptions: options.vadOptions || VAD_PRESETS.default,
107
+ vadPreset: options.vadPreset,
108
+ autoSliceOnSpeechEnd: options.autoSliceOnSpeechEnd || true,
109
+ autoSliceThreshold: options.autoSliceThreshold || 0.5,
110
+ transcribeOptions: options.transcribeOptions || {},
111
+ initialPrompt: options.initialPrompt,
112
+ promptPreviousSlices: options.promptPreviousSlices ?? true,
113
+ audioOutputPath: options.audioOutputPath,
114
+ logger: options.logger || (() => {}),
115
+ }
116
+
117
+ // Apply VAD preset if specified
118
+ if (this.options.vadPreset && VAD_PRESETS[this.options.vadPreset]) {
119
+ this.options.vadOptions = {
120
+ ...VAD_PRESETS[this.options.vadPreset],
121
+ ...this.options.vadOptions,
122
+ }
123
+ }
124
+
125
+ // Enable VAD if context is provided and not explicitly disabled
126
+ this.vadEnabled = !!this.vadContext
127
+
128
+ // Initialize managers
129
+ this.sliceManager = new SliceManager(
130
+ this.options.audioSliceSec,
131
+ this.options.maxSlicesInMemory,
132
+ )
133
+
134
+ // Set up audio stream callbacks
135
+ this.audioStream.onData(this.handleAudioData.bind(this))
136
+ this.audioStream.onError(this.handleError.bind(this))
137
+ this.audioStream.onStatusChange(this.handleAudioStatusChange.bind(this))
138
+ }
139
+
140
+ /**
141
+ * Start realtime transcription
142
+ */
143
+ async start(): Promise<void> {
144
+ if (this.isActive) {
145
+ throw new Error('Realtime transcription is already active')
146
+ }
147
+
148
+ try {
149
+ this.isActive = true
150
+ this.callbacks.onStatusChange?.(true)
151
+
152
+ // Reset all state to ensure clean start
153
+ this.reset()
154
+
155
+ // Initialize WAV file writer if output path is specified
156
+ if (this.fs && this.options.audioOutputPath) {
157
+ this.wavFileWriter = new WavFileWriter(
158
+ this.fs,
159
+ this.options.audioOutputPath,
160
+ {
161
+ sampleRate: this.options.audioStreamConfig?.sampleRate || 16000,
162
+ channels: this.options.audioStreamConfig?.channels || 1,
163
+ bitsPerSample: this.options.audioStreamConfig?.bitsPerSample || 16,
164
+ },
165
+ )
166
+ await this.wavFileWriter.initialize()
167
+ }
168
+
169
+ // Start audio recording
170
+ await this.audioStream.initialize({
171
+ sampleRate: this.options.audioStreamConfig?.sampleRate || 16000,
172
+ channels: this.options.audioStreamConfig?.channels || 1,
173
+ bitsPerSample: this.options.audioStreamConfig?.bitsPerSample || 16,
174
+ audioSource: this.options.audioStreamConfig?.audioSource || 6,
175
+ bufferSize: this.options.audioStreamConfig?.bufferSize || 16 * 1024,
176
+ })
177
+ await this.audioStream.start()
178
+
179
+ // Emit stats update for status change
180
+ this.emitStatsUpdate('status_change')
181
+
182
+ this.log('Realtime transcription started')
183
+ } catch (error) {
184
+ this.isActive = false
185
+ this.callbacks.onStatusChange?.(false)
186
+ throw error
187
+ }
188
+ }
189
+
190
+ /**
191
+ * Stop realtime transcription
192
+ */
193
+ async stop(): Promise<void> {
194
+ if (!this.isActive) {
195
+ return
196
+ }
197
+
198
+ try {
199
+ this.isActive = false
200
+
201
+ // Stop audio recording
202
+ await this.audioStream.stop()
203
+
204
+ // Process any remaining accumulated data
205
+ if (this.accumulatedData.length > 0) {
206
+ this.processAccumulatedDataForSliceManagement()
207
+ }
208
+
209
+ // Process any remaining queued transcriptions
210
+ await this.processTranscriptionQueue()
211
+
212
+ // Finalize WAV file
213
+ if (this.wavFileWriter) {
214
+ await this.wavFileWriter.finalize()
215
+ this.wavFileWriter = null
216
+ }
217
+
218
+ // Reset all state completely
219
+ this.reset()
220
+
221
+ this.callbacks.onStatusChange?.(false)
222
+
223
+ // Emit stats update for status change
224
+ this.emitStatsUpdate('status_change')
225
+
226
+ this.log('Realtime transcription stopped')
227
+ } catch (error) {
228
+ this.handleError(`Stop error: ${error}`)
229
+ }
230
+ }
231
+
232
+ /**
233
+ * Handle incoming audio data from audio stream
234
+ */
235
+ private handleAudioData(streamData: AudioStreamData): void {
236
+ if (!this.isActive) {
237
+ return
238
+ }
239
+
240
+ try {
241
+ // Write to WAV file if enabled (convert to Uint8Array for WavFileWriter)
242
+ if (this.wavFileWriter) {
243
+ this.wavFileWriter.appendAudioData(streamData.data).catch((error) => {
244
+ this.log(`Failed to write audio to WAV file: ${error}`)
245
+ })
246
+ }
247
+
248
+ // Always accumulate data for slice management
249
+ this.accumulateAudioData(streamData.data)
250
+ } catch (error) {
251
+ const errorMessage =
252
+ error instanceof Error ? error.message : 'Audio processing error'
253
+ this.handleError(errorMessage)
254
+ }
255
+ }
256
+
257
+ /**
258
+ * Accumulate audio data for slice management
259
+ */
260
+ private accumulateAudioData(newData: Uint8Array): void {
261
+ const combined = new Uint8Array(
262
+ this.accumulatedData.length + newData.length,
263
+ )
264
+ combined.set(this.accumulatedData)
265
+ combined.set(new Uint8Array(newData), this.accumulatedData.length)
266
+ this.accumulatedData = combined
267
+
268
+ // Process accumulated data when we have enough for slice management
269
+ const minBufferSamples = 16000 * 1 // 1 second for slice management
270
+ if (this.accumulatedData.length >= minBufferSamples) {
271
+ this.processAccumulatedDataForSliceManagement()
272
+ }
273
+ }
274
+
275
+ /**
276
+ * Process accumulated audio data through SliceManager
277
+ */
278
+ private processAccumulatedDataForSliceManagement(): void {
279
+ if (this.accumulatedData.length === 0) {
280
+ return
281
+ }
282
+
283
+ // Process through slice manager directly with Uint8Array
284
+ const result = this.sliceManager.addAudioData(this.accumulatedData)
285
+
286
+ if (result.slice) {
287
+ this.log(
288
+ `Slice ${result.slice.index} ready (${result.slice.data.length} bytes)`,
289
+ )
290
+
291
+ // Process VAD for the slice if enabled
292
+ if (!this.isTranscribing && this.vadEnabled) {
293
+ this.processSliceVAD(result.slice).catch((error: any) => {
294
+ this.handleError(`VAD processing error: ${error}`)
295
+ })
296
+ } else if (!this.isTranscribing) {
297
+ // If VAD is disabled, transcribe slices as they become ready
298
+ this.queueSliceForTranscription(result.slice).catch((error: any) => {
299
+ this.handleError(`Failed to queue slice for transcription: ${error}`)
300
+ })
301
+ } else {
302
+ this.log(`Skipping slice ${result.slice.index} - already transcribing`)
303
+ }
304
+
305
+ this.emitStatsUpdate('memory_change')
306
+ }
307
+
308
+ // Clear accumulated data
309
+ this.accumulatedData = new Uint8Array(0)
310
+ }
311
+
312
+ /**
313
+ * Check if auto-slice should be triggered based on VAD event and timing
314
+ */
315
+ private async checkAutoSlice(
316
+ vadEvent: RealtimeVadEvent,
317
+ _slice: any,
318
+ ): Promise<void> {
319
+ if (!this.options.autoSliceOnSpeechEnd || !this.vadEnabled) {
320
+ return
321
+ }
322
+
323
+ // Only trigger on speech_end or silence events
324
+ const shouldTriggerAutoSlice =
325
+ vadEvent.type === 'speech_end' || vadEvent.type === 'silence'
326
+
327
+ if (!shouldTriggerAutoSlice) {
328
+ return
329
+ }
330
+
331
+ // Get current slice info from SliceManager
332
+ const currentSliceInfo = this.sliceManager.getCurrentSliceInfo()
333
+ const currentSlice = this.sliceManager.getSliceByIndex(
334
+ currentSliceInfo.currentSliceIndex,
335
+ )
336
+
337
+ if (!currentSlice) {
338
+ return
339
+ }
340
+
341
+ // Calculate current slice duration
342
+ const currentDuration = (Date.now() - currentSlice.startTime) / 1000 // Convert to seconds
343
+ const targetDuration = this.options.audioSliceSec
344
+ const minDuration = this.options.audioMinSec
345
+ const autoSliceThreshold = targetDuration * this.options.autoSliceThreshold
346
+
347
+ // Check if conditions are met for auto-slice
348
+ const meetsMinDuration = currentDuration >= minDuration
349
+ const meetsThreshold = currentDuration >= autoSliceThreshold
350
+
351
+ if (meetsMinDuration && meetsThreshold) {
352
+ this.log(
353
+ `Auto-slicing on ${vadEvent.type} at ${currentDuration.toFixed(1)}s ` +
354
+ `(min: ${minDuration}s, threshold: ${autoSliceThreshold.toFixed(
355
+ 1,
356
+ )}s, target: ${targetDuration}s)`,
357
+ )
358
+
359
+ // Force next slice
360
+ await this.nextSlice()
361
+ } else {
362
+ this.log(
363
+ `Auto-slice conditions not met on ${vadEvent.type}: ` +
364
+ `duration=${currentDuration.toFixed(
365
+ 1,
366
+ )}s, min=${minDuration}s, threshold=${autoSliceThreshold.toFixed(
367
+ 1,
368
+ )}s ` +
369
+ `(minOk=${meetsMinDuration}, thresholdOk=${meetsThreshold})`,
370
+ )
371
+ }
372
+ }
373
+
374
+ /**
375
+ * Process VAD for a completed slice
376
+ */
377
+ private async processSliceVAD(slice: any): Promise<void> {
378
+ try {
379
+ // Get audio data from the slice for VAD processing
380
+ const audioData = this.sliceManager.getAudioDataForTranscription(
381
+ slice.index,
382
+ )
383
+
384
+ if (!audioData) {
385
+ this.log(
386
+ `No audio data available for VAD processing of slice ${slice.index}`,
387
+ )
388
+ return
389
+ }
390
+
391
+ // Convert base64 back to Uint8Array for VAD processing
392
+
393
+ // Detect speech in the slice
394
+ const vadEvent = await this.detectSpeech(audioData, slice.index)
395
+ vadEvent.timestamp = Date.now()
396
+
397
+ // Emit VAD event
398
+ this.callbacks.onVad?.(vadEvent)
399
+
400
+ // Check if auto-slice should be triggered
401
+ await this.checkAutoSlice(vadEvent, slice)
402
+
403
+ // Check if speech was detected and if we should transcribe
404
+ const isSpeech =
405
+ vadEvent.type === 'speech_start' || vadEvent.type === 'speech_continue'
406
+
407
+ const isSpeechEnd = vadEvent.type === 'speech_end'
408
+
409
+ if (isSpeech) {
410
+ const minDuration = this.options.audioMinSec
411
+ // Check minimum duration requirement
412
+ const speechDuration = slice.data.length / 16000 / 2 // Convert bytes to seconds (16kHz, 16-bit)
413
+
414
+ if (speechDuration >= minDuration) {
415
+ this.log(
416
+ `Speech detected in slice ${slice.index}, queueing for transcription`,
417
+ )
418
+ await this.queueSliceForTranscription(slice)
419
+ } else {
420
+ this.log(
421
+ `Speech too short in slice ${slice.index} (${speechDuration.toFixed(
422
+ 2,
423
+ )}s < ${minDuration}s), skipping`,
424
+ )
425
+ }
426
+ } else if (isSpeechEnd) {
427
+ this.log(`Speech ended in slice ${slice.index}`)
428
+ // For speech_end events, we might want to queue the slice for transcription
429
+ // to capture the final part of the speech segment
430
+ const speechDuration = slice.data.length / 16000 / 2 // Convert bytes to seconds
431
+ const minDuration = this.options.audioMinSec
432
+
433
+ if (speechDuration >= minDuration) {
434
+ this.log(
435
+ `Speech end detected in slice ${slice.index}, queueing final segment for transcription`,
436
+ )
437
+ await this.queueSliceForTranscription(slice)
438
+ } else {
439
+ this.log(
440
+ `Speech end segment too short in slice ${
441
+ slice.index
442
+ } (${speechDuration.toFixed(2)}s < ${minDuration}s), skipping`,
443
+ )
444
+ }
445
+ } else {
446
+ this.log(`No speech detected in slice ${slice.index}`)
447
+ }
448
+
449
+ // Emit stats update for VAD change
450
+ this.emitStatsUpdate('vad_change')
451
+ } catch (error: any) {
452
+ this.handleError(
453
+ `VAD processing error for slice ${slice.index}: ${error}`,
454
+ )
455
+ }
456
+ }
457
+
458
+ /**
459
+ * Queue a slice for transcription
460
+ */
461
+ private async queueSliceForTranscription(slice: any): Promise<void> {
462
+ try {
463
+ // Get audio data from the slice
464
+ const audioData = this.sliceManager.getAudioDataForTranscription(
465
+ slice.index,
466
+ )
467
+
468
+ if (!audioData) {
469
+ this.log(`No audio data available for slice ${slice.index}`)
470
+ return
471
+ }
472
+
473
+ // Add to transcription queue
474
+ this.transcriptionQueue.unshift({
475
+ sliceIndex: slice.index,
476
+ audioData,
477
+ })
478
+
479
+ this.log(
480
+ `Queued slice ${slice.index} for transcription (${slice.data.length} samples)`,
481
+ )
482
+
483
+ await this.processTranscriptionQueue()
484
+ } catch (error: any) {
485
+ this.handleError(`Failed to queue slice for transcription: ${error}`)
486
+ }
487
+ }
488
+
489
+ /**
490
+ * Detect speech using VAD context
491
+ */
492
+ private async detectSpeech(
493
+ audioData: Uint8Array,
494
+ sliceIndex: number,
495
+ ): Promise<RealtimeVadEvent> {
496
+ if (!this.vadContext) {
497
+ // When no VAD context is available, assume speech is always detected
498
+ // but still follow the state machine pattern
499
+ const currentTimestamp = Date.now()
500
+
501
+ // Assume speech is always detected when no VAD context
502
+ const vadEventType: RealtimeVadEvent['type'] =
503
+ this.lastVadState === 'silence' ? 'speech_start' : 'speech_continue'
504
+
505
+ // Update VAD state
506
+ this.lastVadState = 'speech'
507
+
508
+ const { sampleRate = 16000 } = this.options.audioStreamConfig || {}
509
+ return {
510
+ type: vadEventType,
511
+ lastSpeechDetectedTime: 0,
512
+ timestamp: currentTimestamp,
513
+ confidence: 1.0,
514
+ duration: audioData.length / sampleRate / 2, // Convert bytes to seconds
515
+ sliceIndex,
516
+ }
517
+ }
518
+
519
+ try {
520
+ const audioBuffer = audioData.buffer as SharedArrayBuffer
521
+
522
+ // Use VAD context to detect speech segments
523
+ const vadSegments = await this.vadContext.detectSpeechData(
524
+ audioBuffer,
525
+ this.options.vadOptions,
526
+ )
527
+
528
+ // Calculate confidence based on speech segments
529
+ let confidence = 0.0
530
+ let lastSpeechDetectedTime = 0
531
+ if (vadSegments && vadSegments.length > 0) {
532
+ // If there are speech segments, calculate average confidence
533
+ const totalTime = vadSegments.reduce(
534
+ (sum, segment) => sum + (segment.t1 - segment.t0),
535
+ 0,
536
+ )
537
+ const audioDuration = audioData.length / 16000 / 2 // Convert bytes to seconds
538
+ confidence =
539
+ totalTime > 0 ? Math.min(totalTime / audioDuration, 1.0) : 0.0
540
+ lastSpeechDetectedTime = vadSegments[vadSegments.length - 1]?.t1 || -1
541
+ }
542
+
543
+ const threshold = this.options.vadOptions.threshold || 0.5
544
+ let isSpeech = confidence > threshold
545
+ const currentTimestamp = Date.now()
546
+
547
+ // Determine VAD event type based on current and previous state
548
+ let vadEventType: RealtimeVadEvent['type']
549
+ if (isSpeech) {
550
+ vadEventType =
551
+ this.lastVadState === 'silence' ? 'speech_start' : 'speech_continue'
552
+
553
+ const minDuration = this.options.audioMinSec
554
+ // Check if this is a new speech detection (different from last detected time)
555
+ if (
556
+ lastSpeechDetectedTime === this.lastSpeechDetectedTime ||
557
+ (lastSpeechDetectedTime - this.lastSpeechDetectedTime) / 100 <
558
+ minDuration
559
+ ) {
560
+ if (this.lastVadState === 'silence') vadEventType = 'silence'
561
+ if (this.lastVadState === 'speech') vadEventType = 'speech_end'
562
+ isSpeech = false
563
+ confidence = 0.0
564
+ }
565
+ this.lastSpeechDetectedTime = lastSpeechDetectedTime
566
+ } else {
567
+ vadEventType = this.lastVadState === 'speech' ? 'speech_end' : 'silence'
568
+ }
569
+
570
+ // Update VAD state for next detection
571
+ this.lastVadState = isSpeech ? 'speech' : 'silence'
572
+
573
+ const { sampleRate = 16000 } = this.options.audioStreamConfig || {}
574
+ return {
575
+ type: vadEventType,
576
+ lastSpeechDetectedTime,
577
+ timestamp: currentTimestamp,
578
+ confidence,
579
+ duration: audioData.length / sampleRate / 2, // Convert bytes to seconds
580
+ sliceIndex,
581
+ currentThreshold: threshold,
582
+ }
583
+ } catch (error) {
584
+ this.log(`VAD detection error: ${error}`)
585
+ // Re-throw the error so it can be handled by the caller
586
+ throw error
587
+ }
588
+ }
589
+
590
+ private isProcessingTranscriptionQueue = false
591
+
592
+ /**
593
+ * Process the transcription queue
594
+ */
595
+ private async processTranscriptionQueue(): Promise<void> {
596
+ if (this.isProcessingTranscriptionQueue) return
597
+
598
+ this.isProcessingTranscriptionQueue = true
599
+
600
+ while (this.transcriptionQueue.length > 0) {
601
+ const item = this.transcriptionQueue.shift()
602
+ this.transcriptionQueue = [] // Old items are not needed anymore
603
+ if (item) {
604
+ // eslint-disable-next-line no-await-in-loop
605
+ await this.processTranscription(item).catch((error) => {
606
+ this.handleError(`Transcription error: ${error}`)
607
+ })
608
+ }
609
+ }
610
+
611
+ this.isProcessingTranscriptionQueue = false
612
+ }
613
+
614
+ /**
615
+ * Build prompt from initial prompt and previous slices
616
+ */
617
+ private buildPrompt(currentSliceIndex: number): string | undefined {
618
+ const promptParts: string[] = []
619
+
620
+ // Add initial prompt if provided
621
+ if (this.options.initialPrompt) {
622
+ promptParts.push(this.options.initialPrompt)
623
+ }
624
+
625
+ // Add previous slice results if enabled
626
+ if (this.options.promptPreviousSlices) {
627
+ // Get transcription results from previous slices (up to the current slice)
628
+ const previousResults = Array.from(this.transcriptionResults.entries())
629
+ .filter(([sliceIndex]) => sliceIndex < currentSliceIndex)
630
+ .sort(([a], [b]) => a - b) // Sort by slice index
631
+ .map(([, result]) => result.transcribeEvent.data?.result)
632
+ .filter((result): result is string => Boolean(result)) // Filter out empty results with type guard
633
+
634
+ if (previousResults.length > 0) {
635
+ promptParts.push(...previousResults)
636
+ }
637
+ }
638
+
639
+ return promptParts.join(' ') || undefined
640
+ }
641
+
642
+ /**
643
+ * Process a single transcription
644
+ */
645
+ private async processTranscription(item: {
646
+ sliceIndex: number
647
+ audioData: Uint8Array
648
+ }): Promise<void> {
649
+ if (!this.isActive) {
650
+ return
651
+ }
652
+
653
+ this.isTranscribing = true
654
+
655
+ // Emit stats update for status change
656
+ this.emitStatsUpdate('status_change')
657
+
658
+ const startTime = Date.now()
659
+
660
+ try {
661
+ // Build prompt from initial prompt and previous slices
662
+ const prompt = this.buildPrompt(item.sliceIndex)
663
+
664
+ const audioBuffer = item.audioData.buffer as SharedArrayBuffer
665
+ const { promise } = this.whisperContext.transcribeData(audioBuffer, {
666
+ ...this.options.transcribeOptions,
667
+ prompt, // Include the constructed prompt
668
+ onProgress: undefined, // Disable progress for realtime
669
+ })
670
+
671
+ const result = await promise
672
+ const endTime = Date.now()
673
+
674
+ // Create transcribe event
675
+ const { sampleRate = 16000 } = this.options.audioStreamConfig || {}
676
+ const transcribeEvent: RealtimeTranscribeEvent = {
677
+ type: 'transcribe',
678
+ sliceIndex: item.sliceIndex,
679
+ data: result,
680
+ isCapturing: this.audioStream.isRecording(),
681
+ processTime: endTime - startTime,
682
+ recordingTime: item.audioData.length / (sampleRate / 1000) / 2, // ms,
683
+ memoryUsage: this.sliceManager.getMemoryUsage(),
684
+ }
685
+
686
+ // Emit transcribe event
687
+ this.callbacks.onTranscribe?.(transcribeEvent)
688
+
689
+ // Save transcription results
690
+ const slice = this.sliceManager.getSliceByIndex(item.sliceIndex)
691
+ if (slice) {
692
+ this.transcriptionResults.set(item.sliceIndex, {
693
+ slice: {
694
+ // Don't keep data in the slice
695
+ index: slice.index,
696
+ sampleCount: slice.sampleCount,
697
+ startTime: slice.startTime,
698
+ endTime: slice.endTime,
699
+ isProcessed: slice.isProcessed,
700
+ isReleased: slice.isReleased,
701
+ },
702
+ transcribeEvent,
703
+ })
704
+ }
705
+
706
+ // Emit stats update for memory/slice changes
707
+ this.emitStatsUpdate('memory_change')
708
+
709
+ this.log(
710
+ `Transcribed speech segment ${item.sliceIndex}: "${result.result}"`,
711
+ )
712
+ } catch (error) {
713
+ // Emit error event to transcribe callback
714
+ const errorEvent: RealtimeTranscribeEvent = {
715
+ type: 'error',
716
+ sliceIndex: item.sliceIndex,
717
+ data: undefined,
718
+ isCapturing: this.audioStream.isRecording(),
719
+ processTime: Date.now() - startTime,
720
+ recordingTime: 0,
721
+ memoryUsage: this.sliceManager.getMemoryUsage(),
722
+ }
723
+
724
+ this.callbacks.onTranscribe?.(errorEvent)
725
+
726
+ this.handleError(
727
+ `Transcription failed for speech segment ${item.sliceIndex}: ${error}`,
728
+ )
729
+ } finally {
730
+ // Check if we should continue processing queue
731
+ if (this.transcriptionQueue.length > 0) {
732
+ await this.processTranscriptionQueue()
733
+ } else {
734
+ this.isTranscribing = false
735
+ }
736
+ }
737
+ }
738
+
739
+ /**
740
+ * Handle audio status changes
741
+ */
742
+ private handleAudioStatusChange(isRecording: boolean): void {
743
+ this.log(`Audio recording: ${isRecording ? 'started' : 'stopped'}`)
744
+ }
745
+
746
+ /**
747
+ * Handle errors from components
748
+ */
749
+ private handleError(error: string): void {
750
+ this.log(`Error: ${error}`)
751
+ this.callbacks.onError?.(error)
752
+ }
753
+
754
+ /**
755
+ * Update callbacks
756
+ */
757
+ updateCallbacks(callbacks: Partial<RealtimeTranscriberCallbacks>): void {
758
+ this.callbacks = { ...this.callbacks, ...callbacks }
759
+ }
760
+
761
+ /**
762
+ * Update VAD options dynamically
763
+ */
764
+ updateVadOptions(options: Partial<VadOptions>): void {
765
+ this.options.vadOptions = { ...this.options.vadOptions, ...options }
766
+ }
767
+
768
+ /**
769
+ * Update auto-slice options dynamically
770
+ */
771
+ updateAutoSliceOptions(options: {
772
+ autoSliceOnSpeechEnd?: boolean
773
+ autoSliceThreshold?: number
774
+ }): void {
775
+ if (options.autoSliceOnSpeechEnd !== undefined) {
776
+ this.options.autoSliceOnSpeechEnd = options.autoSliceOnSpeechEnd
777
+ }
778
+ if (options.autoSliceThreshold !== undefined) {
779
+ this.options.autoSliceThreshold = options.autoSliceThreshold
780
+ }
781
+ this.log(
782
+ `Auto-slice options updated: enabled=${this.options.autoSliceOnSpeechEnd}, threshold=${this.options.autoSliceThreshold}`,
783
+ )
784
+ }
785
+
786
+ /**
787
+ * Get current statistics
788
+ */
789
+ getStatistics() {
790
+ return {
791
+ isActive: this.isActive,
792
+ isTranscribing: this.isTranscribing,
793
+ vadEnabled: this.vadEnabled,
794
+ audioStats: {
795
+ isRecording: this.audioStream.isRecording(),
796
+ accumulatedSamples: this.accumulatedData.length,
797
+ },
798
+ vadStats: this.vadEnabled
799
+ ? {
800
+ enabled: true,
801
+ contextAvailable: !!this.vadContext,
802
+ lastSpeechDetectedTime: this.lastSpeechDetectedTime,
803
+ }
804
+ : null,
805
+ sliceStats: this.sliceManager.getCurrentSliceInfo(),
806
+ autoSliceConfig: {
807
+ enabled: this.options.autoSliceOnSpeechEnd,
808
+ threshold: this.options.autoSliceThreshold,
809
+ targetDuration: this.options.audioSliceSec,
810
+ minDuration: this.options.audioMinSec,
811
+ },
812
+ }
813
+ }
814
+
815
+ /**
816
+ * Get all transcription results
817
+ */
818
+ getTranscriptionResults(): Array<{
819
+ slice: AudioSliceNoData
820
+ transcribeEvent: RealtimeTranscribeEvent
821
+ }> {
822
+ return Array.from(this.transcriptionResults.values())
823
+ }
824
+
825
  /**
   * Force move to the next slice, finalizing the current one regardless
   * of capacity.
   *
   * Flow:
   *  1. Emit a synthetic 'start' transcribe event (sliceIndex -1 marks a
   *     forced slice, distinguishing it from the normal slice pipeline).
   *  2. Drain any pending transcription work first, so the forced slice
   *     is not interleaved with in-flight jobs.
   *  3. Ask the slice manager to close out the current slice.
   *  4. Route the finished slice through VAD (when enabled) or straight
   *     to the transcription queue, mirroring the normal slice-ready path.
   *
   * No-op (with a log line) when the transcriber is not active or the
   * manager returns no slice data.
   */
  async nextSlice(): Promise<void> {
    if (!this.isActive) {
      this.log('Cannot force next slice - transcriber is not active')
      return
    }

    // Emit start event to indicate slice processing has started
    const startEvent: RealtimeTranscribeEvent = {
      type: 'start',
      sliceIndex: -1, // Use -1 to indicate forced slice
      data: undefined,
      isCapturing: this.audioStream.isRecording(),
      processTime: 0,
      recordingTime: 0,
      memoryUsage: this.sliceManager.getMemoryUsage(),
    }

    this.callbacks.onTranscribe?.(startEvent)

    // Check if there are pending transcriptions or currently transcribing
    if (this.isTranscribing || this.transcriptionQueue.length > 0) {
      this.log(
        'Waiting for pending transcriptions to complete before forcing next slice...',
      )

      // Wait for current transcription queue to be processed.
      // NOTE(review): isTranscribing may flip back to true below if new
      // work arrived while awaiting — the branches re-check it on purpose.
      await this.processTranscriptionQueue()
    }

    const result = this.sliceManager.forceNextSlice()

    if (result.slice) {
      // NOTE(review): data.length is the buffer element count; the log
      // labels it "bytes" — confirm slices store raw PCM bytes, not samples.
      this.log(
        `Forced slice ${result.slice.index} ready (${result.slice.data.length} bytes)`,
      )

      // Process VAD for the slice if enabled (fire-and-forget; errors are
      // funneled into handleError rather than rejecting nextSlice itself)
      if (!this.isTranscribing && this.vadEnabled) {
        this.processSliceVAD(result.slice).catch((error: any) => {
          this.handleError(`VAD processing error: ${error}`)
        })
      } else if (!this.isTranscribing) {
        // If VAD is disabled, transcribe slices as they become ready
        this.queueSliceForTranscription(result.slice).catch((error: any) => {
          this.handleError(`Failed to queue slice for transcription: ${error}`)
        })
      } else {
        // A transcription started while we were waiting; drop this slice.
        this.log(`Skipping slice ${result.slice.index} - already transcribing`)
      }

      this.emitStatsUpdate('memory_change')
    } else {
      this.log('Forced next slice but no slice data to process')
    }
  }
883
+
884
+ /**
885
+ * Reset all components
886
+ */
887
+ reset(): void {
888
+ this.sliceManager.reset()
889
+ this.transcriptionQueue = []
890
+ this.isTranscribing = false
891
+ this.accumulatedData = new Uint8Array(0)
892
+
893
+ // Reset simplified VAD state
894
+ this.lastSpeechDetectedTime = -1
895
+ this.lastVadState = 'silence'
896
+
897
+ // Reset stats snapshot for clean start
898
+ this.lastStatsSnapshot = null
899
+
900
+ // Cancel WAV file writing if in progress
901
+ if (this.wavFileWriter) {
902
+ this.wavFileWriter.cancel().catch((error) => {
903
+ this.log(`Failed to cancel WAV file writing: ${error}`)
904
+ })
905
+ this.wavFileWriter = null
906
+ }
907
+
908
+ // Clear transcription results
909
+ this.transcriptionResults.clear()
910
+ }
911
+
912
+ /**
913
+ * Release all resources
914
+ */
915
+ async release(): Promise<void> {
916
+ if (this.isActive) {
917
+ await this.stop()
918
+ }
919
+
920
+ await this.audioStream.release()
921
+ await this.wavFileWriter?.finalize()
922
+ this.vadContext = undefined
923
+ }
924
+
925
+ /**
926
+ * Emit stats update event if stats have changed significantly
927
+ */
928
+ private emitStatsUpdate(eventType: RealtimeStatsEvent['type']): void {
929
+ const currentStats = this.getStatistics()
930
+
931
+ // Check if stats have changed significantly
932
+ if (
933
+ !this.lastStatsSnapshot ||
934
+ RealtimeTranscriber.shouldEmitStatsUpdate(
935
+ currentStats,
936
+ this.lastStatsSnapshot,
937
+ )
938
+ ) {
939
+ const statsEvent: RealtimeStatsEvent = {
940
+ timestamp: Date.now(),
941
+ type: eventType,
942
+ data: currentStats,
943
+ }
944
+
945
+ this.callbacks.onStatsUpdate?.(statsEvent)
946
+ this.lastStatsSnapshot = { ...currentStats }
947
+ }
948
+ }
949
+
950
+ /**
951
+ * Determine if stats update should be emitted
952
+ */
953
+ private static shouldEmitStatsUpdate(current: any, previous: any): boolean {
954
+ // Always emit on status changes
955
+ if (
956
+ current.isActive !== previous.isActive ||
957
+ current.isTranscribing !== previous.isTranscribing
958
+ ) {
959
+ return true
960
+ }
961
+
962
+ // Emit on significant memory changes (>10% or >5MB)
963
+ const currentMemory = current.sliceStats?.memoryUsage?.estimatedMB || 0
964
+ const previousMemory = previous.sliceStats?.memoryUsage?.estimatedMB || 0
965
+ const memoryDiff = Math.abs(currentMemory - previousMemory)
966
+
967
+ if (
968
+ memoryDiff > 5 ||
969
+ (previousMemory > 0 && memoryDiff / previousMemory > 0.1)
970
+ ) {
971
+ return true
972
+ }
973
+
974
+ return false
975
+ }
976
+
977
+ /**
978
+ * Logger function
979
+ */
980
+ private log(message: string): void {
981
+ this.options.logger(`[RealtimeTranscriber] ${message}`)
982
+ }
983
+ }