whisper.rn 0.5.0-rc.1 → 0.5.0-rc.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/README.md +119 -50
  2. package/android/src/main/java/com/rnwhisper/RNWhisper.java +26 -0
  3. package/android/src/main/java/com/rnwhisper/WhisperContext.java +25 -0
  4. package/android/src/main/jni.cpp +81 -0
  5. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +5 -0
  6. package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +5 -0
  7. package/ios/RNWhisper.mm +11 -0
  8. package/ios/RNWhisperContext.h +1 -0
  9. package/ios/RNWhisperContext.mm +46 -0
  10. package/lib/commonjs/AudioSessionIos.js +2 -1
  11. package/lib/commonjs/AudioSessionIos.js.map +1 -1
  12. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  13. package/lib/commonjs/index.js +26 -0
  14. package/lib/commonjs/index.js.map +1 -1
  15. package/lib/commonjs/jest-mock.js +126 -0
  16. package/lib/commonjs/jest-mock.js.map +1 -0
  17. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +831 -0
  18. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -0
  19. package/lib/commonjs/realtime-transcription/SliceManager.js +233 -0
  20. package/lib/commonjs/realtime-transcription/SliceManager.js.map +1 -0
  21. package/lib/commonjs/realtime-transcription/adapters/AudioPcmStreamAdapter.js +133 -0
  22. package/lib/commonjs/realtime-transcription/adapters/AudioPcmStreamAdapter.js.map +1 -0
  23. package/lib/commonjs/realtime-transcription/adapters/JestAudioStreamAdapter.js +201 -0
  24. package/lib/commonjs/realtime-transcription/adapters/JestAudioStreamAdapter.js.map +1 -0
  25. package/lib/commonjs/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js +309 -0
  26. package/lib/commonjs/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js.map +1 -0
  27. package/lib/commonjs/realtime-transcription/index.js +27 -0
  28. package/lib/commonjs/realtime-transcription/index.js.map +1 -0
  29. package/lib/commonjs/realtime-transcription/types.js +114 -0
  30. package/lib/commonjs/realtime-transcription/types.js.map +1 -0
  31. package/lib/commonjs/utils/WavFileReader.js +158 -0
  32. package/lib/commonjs/utils/WavFileReader.js.map +1 -0
  33. package/lib/commonjs/utils/WavFileWriter.js +181 -0
  34. package/lib/commonjs/utils/WavFileWriter.js.map +1 -0
  35. package/lib/commonjs/utils/common.js +25 -0
  36. package/lib/commonjs/utils/common.js.map +1 -0
  37. package/lib/module/AudioSessionIos.js +2 -1
  38. package/lib/module/AudioSessionIos.js.map +1 -1
  39. package/lib/module/NativeRNWhisper.js.map +1 -1
  40. package/lib/module/index.js +24 -0
  41. package/lib/module/index.js.map +1 -1
  42. package/lib/module/jest-mock.js +124 -0
  43. package/lib/module/jest-mock.js.map +1 -0
  44. package/lib/module/realtime-transcription/RealtimeTranscriber.js +825 -0
  45. package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -0
  46. package/lib/module/realtime-transcription/SliceManager.js +226 -0
  47. package/lib/module/realtime-transcription/SliceManager.js.map +1 -0
  48. package/lib/module/realtime-transcription/adapters/AudioPcmStreamAdapter.js +124 -0
  49. package/lib/module/realtime-transcription/adapters/AudioPcmStreamAdapter.js.map +1 -0
  50. package/lib/module/realtime-transcription/adapters/JestAudioStreamAdapter.js +194 -0
  51. package/lib/module/realtime-transcription/adapters/JestAudioStreamAdapter.js.map +1 -0
  52. package/lib/module/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js +302 -0
  53. package/lib/module/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js.map +1 -0
  54. package/lib/module/realtime-transcription/index.js +8 -0
  55. package/lib/module/realtime-transcription/index.js.map +1 -0
  56. package/lib/module/realtime-transcription/types.js +107 -0
  57. package/lib/module/realtime-transcription/types.js.map +1 -0
  58. package/lib/module/utils/WavFileReader.js +151 -0
  59. package/lib/module/utils/WavFileReader.js.map +1 -0
  60. package/lib/module/utils/WavFileWriter.js +174 -0
  61. package/lib/module/utils/WavFileWriter.js.map +1 -0
  62. package/lib/module/utils/common.js +18 -0
  63. package/lib/module/utils/common.js.map +1 -0
  64. package/lib/typescript/AudioSessionIos.d.ts +1 -1
  65. package/lib/typescript/AudioSessionIos.d.ts.map +1 -1
  66. package/lib/typescript/NativeRNWhisper.d.ts +1 -0
  67. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  68. package/lib/typescript/index.d.ts +4 -0
  69. package/lib/typescript/index.d.ts.map +1 -1
  70. package/lib/typescript/jest-mock.d.ts +2 -0
  71. package/lib/typescript/jest-mock.d.ts.map +1 -0
  72. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts +165 -0
  73. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -0
  74. package/lib/typescript/realtime-transcription/SliceManager.d.ts +72 -0
  75. package/lib/typescript/realtime-transcription/SliceManager.d.ts.map +1 -0
  76. package/lib/typescript/realtime-transcription/adapters/AudioPcmStreamAdapter.d.ts +22 -0
  77. package/lib/typescript/realtime-transcription/adapters/AudioPcmStreamAdapter.d.ts.map +1 -0
  78. package/lib/typescript/realtime-transcription/adapters/JestAudioStreamAdapter.d.ts +44 -0
  79. package/lib/typescript/realtime-transcription/adapters/JestAudioStreamAdapter.d.ts.map +1 -0
  80. package/lib/typescript/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.d.ts +75 -0
  81. package/lib/typescript/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.d.ts.map +1 -0
  82. package/lib/typescript/realtime-transcription/index.d.ts +6 -0
  83. package/lib/typescript/realtime-transcription/index.d.ts.map +1 -0
  84. package/lib/typescript/realtime-transcription/types.d.ts +216 -0
  85. package/lib/typescript/realtime-transcription/types.d.ts.map +1 -0
  86. package/lib/typescript/utils/WavFileReader.d.ts +61 -0
  87. package/lib/typescript/utils/WavFileReader.d.ts.map +1 -0
  88. package/lib/typescript/utils/WavFileWriter.d.ts +57 -0
  89. package/lib/typescript/utils/WavFileWriter.d.ts.map +1 -0
  90. package/lib/typescript/utils/common.d.ts +9 -0
  91. package/lib/typescript/utils/common.d.ts.map +1 -0
  92. package/package.json +18 -6
  93. package/src/AudioSessionIos.ts +3 -2
  94. package/src/NativeRNWhisper.ts +2 -0
  95. package/src/index.ts +34 -0
  96. package/{jest/mock.js → src/jest-mock.ts} +2 -2
  97. package/src/realtime-transcription/RealtimeTranscriber.ts +983 -0
  98. package/src/realtime-transcription/SliceManager.ts +252 -0
  99. package/src/realtime-transcription/adapters/AudioPcmStreamAdapter.ts +143 -0
  100. package/src/realtime-transcription/adapters/JestAudioStreamAdapter.ts +251 -0
  101. package/src/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.ts +378 -0
  102. package/src/realtime-transcription/index.ts +34 -0
  103. package/src/realtime-transcription/types.ts +277 -0
  104. package/src/utils/WavFileReader.ts +202 -0
  105. package/src/utils/WavFileWriter.ts +206 -0
  106. package/src/utils/common.ts +17 -0
@@ -0,0 +1,825 @@
1
+ /* eslint-disable class-methods-use-this */
2
+
3
+ import { SliceManager } from './SliceManager';
4
+ import { WavFileWriter } from '../utils/WavFileWriter';
5
+ import { VAD_PRESETS } from './types';
6
+
7
+ /**
8
+ * RealtimeTranscriber provides real-time audio transcription with VAD support.
9
+ *
10
+ * Features:
11
+ * - Automatic slice management based on duration
12
+ * - VAD-based speech detection and auto-slicing
13
+ * - Configurable auto-slice mechanism that triggers on speech_end/silence events
14
+ * - Memory management for audio slices
15
+ * - Queue-based transcription processing
16
+ */
17
+ export class RealtimeTranscriber {
18
+ callbacks = {};
19
+ isActive = false;
20
+ isTranscribing = false;
21
+ vadEnabled = false;
22
+ transcriptionQueue = [];
23
+ accumulatedData = new Uint8Array(0);
24
+ wavFileWriter = null;
25
+
26
+ // Simplified VAD state management
27
+ lastSpeechDetectedTime = 0;
28
+
29
+ // Track VAD state for proper event transitions
30
+ lastVadState = 'silence';
31
+
32
+ // Track last stats to emit only when changed
33
+ lastStatsSnapshot = null;
34
+
35
+ // Store transcription results by slice index
36
+ transcriptionResults = new Map();
37
+ constructor(dependencies) {
38
+ let options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};
39
+ let callbacks = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : {};
40
+ this.whisperContext = dependencies.whisperContext;
41
+ this.vadContext = dependencies.vadContext;
42
+ this.audioStream = dependencies.audioStream;
43
+ this.fs = dependencies.fs;
44
+ this.callbacks = callbacks;
45
+
46
+ // Set default options with proper types
47
+ this.options = {
48
+ audioSliceSec: options.audioSliceSec || 30,
49
+ audioMinSec: options.audioMinSec || 1,
50
+ maxSlicesInMemory: options.maxSlicesInMemory || 3,
51
+ vadOptions: options.vadOptions || VAD_PRESETS.default,
52
+ vadPreset: options.vadPreset,
53
+ autoSliceOnSpeechEnd: options.autoSliceOnSpeechEnd || true,
54
+ autoSliceThreshold: options.autoSliceThreshold || 0.5,
55
+ transcribeOptions: options.transcribeOptions || {},
56
+ initialPrompt: options.initialPrompt,
57
+ promptPreviousSlices: options.promptPreviousSlices ?? true,
58
+ audioOutputPath: options.audioOutputPath,
59
+ logger: options.logger || (() => {})
60
+ };
61
+
62
+ // Apply VAD preset if specified
63
+ if (this.options.vadPreset && VAD_PRESETS[this.options.vadPreset]) {
64
+ this.options.vadOptions = {
65
+ ...VAD_PRESETS[this.options.vadPreset],
66
+ ...this.options.vadOptions
67
+ };
68
+ }
69
+
70
+ // Enable VAD if context is provided and not explicitly disabled
71
+ this.vadEnabled = !!this.vadContext;
72
+
73
+ // Initialize managers
74
+ this.sliceManager = new SliceManager(this.options.audioSliceSec, this.options.maxSlicesInMemory);
75
+
76
+ // Set up audio stream callbacks
77
+ this.audioStream.onData(this.handleAudioData.bind(this));
78
+ this.audioStream.onError(this.handleError.bind(this));
79
+ this.audioStream.onStatusChange(this.handleAudioStatusChange.bind(this));
80
+ }
81
+
82
+ /**
83
+ * Start realtime transcription
84
+ */
85
+ async start() {
86
+ if (this.isActive) {
87
+ throw new Error('Realtime transcription is already active');
88
+ }
89
+ try {
90
+ var _this$callbacks$onSta, _this$callbacks, _this$options$audioSt4, _this$options$audioSt5, _this$options$audioSt6, _this$options$audioSt7, _this$options$audioSt8;
91
+ this.isActive = true;
92
+ (_this$callbacks$onSta = (_this$callbacks = this.callbacks).onStatusChange) === null || _this$callbacks$onSta === void 0 ? void 0 : _this$callbacks$onSta.call(_this$callbacks, true);
93
+
94
+ // Reset all state to ensure clean start
95
+ this.reset();
96
+
97
+ // Initialize WAV file writer if output path is specified
98
+ if (this.fs && this.options.audioOutputPath) {
99
+ var _this$options$audioSt, _this$options$audioSt2, _this$options$audioSt3;
100
+ this.wavFileWriter = new WavFileWriter(this.fs, this.options.audioOutputPath, {
101
+ sampleRate: ((_this$options$audioSt = this.options.audioStreamConfig) === null || _this$options$audioSt === void 0 ? void 0 : _this$options$audioSt.sampleRate) || 16000,
102
+ channels: ((_this$options$audioSt2 = this.options.audioStreamConfig) === null || _this$options$audioSt2 === void 0 ? void 0 : _this$options$audioSt2.channels) || 1,
103
+ bitsPerSample: ((_this$options$audioSt3 = this.options.audioStreamConfig) === null || _this$options$audioSt3 === void 0 ? void 0 : _this$options$audioSt3.bitsPerSample) || 16
104
+ });
105
+ await this.wavFileWriter.initialize();
106
+ }
107
+
108
+ // Start audio recording
109
+ await this.audioStream.initialize({
110
+ sampleRate: ((_this$options$audioSt4 = this.options.audioStreamConfig) === null || _this$options$audioSt4 === void 0 ? void 0 : _this$options$audioSt4.sampleRate) || 16000,
111
+ channels: ((_this$options$audioSt5 = this.options.audioStreamConfig) === null || _this$options$audioSt5 === void 0 ? void 0 : _this$options$audioSt5.channels) || 1,
112
+ bitsPerSample: ((_this$options$audioSt6 = this.options.audioStreamConfig) === null || _this$options$audioSt6 === void 0 ? void 0 : _this$options$audioSt6.bitsPerSample) || 16,
113
+ audioSource: ((_this$options$audioSt7 = this.options.audioStreamConfig) === null || _this$options$audioSt7 === void 0 ? void 0 : _this$options$audioSt7.audioSource) || 6,
114
+ bufferSize: ((_this$options$audioSt8 = this.options.audioStreamConfig) === null || _this$options$audioSt8 === void 0 ? void 0 : _this$options$audioSt8.bufferSize) || 16 * 1024
115
+ });
116
+ await this.audioStream.start();
117
+
118
+ // Emit stats update for status change
119
+ this.emitStatsUpdate('status_change');
120
+ this.log('Realtime transcription started');
121
+ } catch (error) {
122
+ var _this$callbacks$onSta2, _this$callbacks2;
123
+ this.isActive = false;
124
+ (_this$callbacks$onSta2 = (_this$callbacks2 = this.callbacks).onStatusChange) === null || _this$callbacks$onSta2 === void 0 ? void 0 : _this$callbacks$onSta2.call(_this$callbacks2, false);
125
+ throw error;
126
+ }
127
+ }
128
+
129
+ /**
130
+ * Stop realtime transcription
131
+ */
132
+ async stop() {
133
+ if (!this.isActive) {
134
+ return;
135
+ }
136
+ try {
137
+ var _this$callbacks$onSta3, _this$callbacks3;
138
+ this.isActive = false;
139
+
140
+ // Stop audio recording
141
+ await this.audioStream.stop();
142
+
143
+ // Process any remaining accumulated data
144
+ if (this.accumulatedData.length > 0) {
145
+ this.processAccumulatedDataForSliceManagement();
146
+ }
147
+
148
+ // Process any remaining queued transcriptions
149
+ await this.processTranscriptionQueue();
150
+
151
+ // Finalize WAV file
152
+ if (this.wavFileWriter) {
153
+ await this.wavFileWriter.finalize();
154
+ this.wavFileWriter = null;
155
+ }
156
+
157
+ // Reset all state completely
158
+ this.reset();
159
+ (_this$callbacks$onSta3 = (_this$callbacks3 = this.callbacks).onStatusChange) === null || _this$callbacks$onSta3 === void 0 ? void 0 : _this$callbacks$onSta3.call(_this$callbacks3, false);
160
+
161
+ // Emit stats update for status change
162
+ this.emitStatsUpdate('status_change');
163
+ this.log('Realtime transcription stopped');
164
+ } catch (error) {
165
+ this.handleError(`Stop error: ${error}`);
166
+ }
167
+ }
168
+
169
+ /**
170
+ * Handle incoming audio data from audio stream
171
+ */
172
+ handleAudioData(streamData) {
173
+ if (!this.isActive) {
174
+ return;
175
+ }
176
+ try {
177
+ // Write to WAV file if enabled (convert to Uint8Array for WavFileWriter)
178
+ if (this.wavFileWriter) {
179
+ this.wavFileWriter.appendAudioData(streamData.data).catch(error => {
180
+ this.log(`Failed to write audio to WAV file: ${error}`);
181
+ });
182
+ }
183
+
184
+ // Always accumulate data for slice management
185
+ this.accumulateAudioData(streamData.data);
186
+ } catch (error) {
187
+ const errorMessage = error instanceof Error ? error.message : 'Audio processing error';
188
+ this.handleError(errorMessage);
189
+ }
190
+ }
191
+
192
+ /**
193
+ * Accumulate audio data for slice management
194
+ */
195
+ accumulateAudioData(newData) {
196
+ const combined = new Uint8Array(this.accumulatedData.length + newData.length);
197
+ combined.set(this.accumulatedData);
198
+ combined.set(new Uint8Array(newData), this.accumulatedData.length);
199
+ this.accumulatedData = combined;
200
+
201
+ // Process accumulated data when we have enough for slice management
202
+ const minBufferSamples = 16000 * 1; // 1 second for slice management
203
+ if (this.accumulatedData.length >= minBufferSamples) {
204
+ this.processAccumulatedDataForSliceManagement();
205
+ }
206
+ }
207
+
208
+ /**
209
+ * Process accumulated audio data through SliceManager
210
+ */
211
+ processAccumulatedDataForSliceManagement() {
212
+ if (this.accumulatedData.length === 0) {
213
+ return;
214
+ }
215
+
216
+ // Process through slice manager directly with Uint8Array
217
+ const result = this.sliceManager.addAudioData(this.accumulatedData);
218
+ if (result.slice) {
219
+ this.log(`Slice ${result.slice.index} ready (${result.slice.data.length} bytes)`);
220
+
221
+ // Process VAD for the slice if enabled
222
+ if (!this.isTranscribing && this.vadEnabled) {
223
+ this.processSliceVAD(result.slice).catch(error => {
224
+ this.handleError(`VAD processing error: ${error}`);
225
+ });
226
+ } else if (!this.isTranscribing) {
227
+ // If VAD is disabled, transcribe slices as they become ready
228
+ this.queueSliceForTranscription(result.slice).catch(error => {
229
+ this.handleError(`Failed to queue slice for transcription: ${error}`);
230
+ });
231
+ } else {
232
+ this.log(`Skipping slice ${result.slice.index} - already transcribing`);
233
+ }
234
+ this.emitStatsUpdate('memory_change');
235
+ }
236
+
237
+ // Clear accumulated data
238
+ this.accumulatedData = new Uint8Array(0);
239
+ }
240
+
241
+ /**
242
+ * Check if auto-slice should be triggered based on VAD event and timing
243
+ */
244
+ async checkAutoSlice(vadEvent, _slice) {
245
+ if (!this.options.autoSliceOnSpeechEnd || !this.vadEnabled) {
246
+ return;
247
+ }
248
+
249
+ // Only trigger on speech_end or silence events
250
+ const shouldTriggerAutoSlice = vadEvent.type === 'speech_end' || vadEvent.type === 'silence';
251
+ if (!shouldTriggerAutoSlice) {
252
+ return;
253
+ }
254
+
255
+ // Get current slice info from SliceManager
256
+ const currentSliceInfo = this.sliceManager.getCurrentSliceInfo();
257
+ const currentSlice = this.sliceManager.getSliceByIndex(currentSliceInfo.currentSliceIndex);
258
+ if (!currentSlice) {
259
+ return;
260
+ }
261
+
262
+ // Calculate current slice duration
263
+ const currentDuration = (Date.now() - currentSlice.startTime) / 1000; // Convert to seconds
264
+ const targetDuration = this.options.audioSliceSec;
265
+ const minDuration = this.options.audioMinSec;
266
+ const autoSliceThreshold = targetDuration * this.options.autoSliceThreshold;
267
+
268
+ // Check if conditions are met for auto-slice
269
+ const meetsMinDuration = currentDuration >= minDuration;
270
+ const meetsThreshold = currentDuration >= autoSliceThreshold;
271
+ if (meetsMinDuration && meetsThreshold) {
272
+ this.log(`Auto-slicing on ${vadEvent.type} at ${currentDuration.toFixed(1)}s ` + `(min: ${minDuration}s, threshold: ${autoSliceThreshold.toFixed(1)}s, target: ${targetDuration}s)`);
273
+
274
+ // Force next slice
275
+ await this.nextSlice();
276
+ } else {
277
+ this.log(`Auto-slice conditions not met on ${vadEvent.type}: ` + `duration=${currentDuration.toFixed(1)}s, min=${minDuration}s, threshold=${autoSliceThreshold.toFixed(1)}s ` + `(minOk=${meetsMinDuration}, thresholdOk=${meetsThreshold})`);
278
+ }
279
+ }
280
+
281
+ /**
282
+ * Process VAD for a completed slice
283
+ */
284
+ async processSliceVAD(slice) {
285
+ try {
286
+ var _this$callbacks$onVad, _this$callbacks4;
287
+ // Get audio data from the slice for VAD processing
288
+ const audioData = this.sliceManager.getAudioDataForTranscription(slice.index);
289
+ if (!audioData) {
290
+ this.log(`No audio data available for VAD processing of slice ${slice.index}`);
291
+ return;
292
+ }
293
+
294
+ // Convert base64 back to Uint8Array for VAD processing
295
+
296
+ // Detect speech in the slice
297
+ const vadEvent = await this.detectSpeech(audioData, slice.index);
298
+ vadEvent.timestamp = Date.now();
299
+
300
+ // Emit VAD event
301
+ (_this$callbacks$onVad = (_this$callbacks4 = this.callbacks).onVad) === null || _this$callbacks$onVad === void 0 ? void 0 : _this$callbacks$onVad.call(_this$callbacks4, vadEvent);
302
+
303
+ // Check if auto-slice should be triggered
304
+ await this.checkAutoSlice(vadEvent, slice);
305
+
306
+ // Check if speech was detected and if we should transcribe
307
+ const isSpeech = vadEvent.type === 'speech_start' || vadEvent.type === 'speech_continue';
308
+ const isSpeechEnd = vadEvent.type === 'speech_end';
309
+ if (isSpeech) {
310
+ const minDuration = this.options.audioMinSec;
311
+ // Check minimum duration requirement
312
+ const speechDuration = slice.data.length / 16000 / 2; // Convert bytes to seconds (16kHz, 16-bit)
313
+
314
+ if (speechDuration >= minDuration) {
315
+ this.log(`Speech detected in slice ${slice.index}, queueing for transcription`);
316
+ await this.queueSliceForTranscription(slice);
317
+ } else {
318
+ this.log(`Speech too short in slice ${slice.index} (${speechDuration.toFixed(2)}s < ${minDuration}s), skipping`);
319
+ }
320
+ } else if (isSpeechEnd) {
321
+ this.log(`Speech ended in slice ${slice.index}`);
322
+ // For speech_end events, we might want to queue the slice for transcription
323
+ // to capture the final part of the speech segment
324
+ const speechDuration = slice.data.length / 16000 / 2; // Convert bytes to seconds
325
+ const minDuration = this.options.audioMinSec;
326
+ if (speechDuration >= minDuration) {
327
+ this.log(`Speech end detected in slice ${slice.index}, queueing final segment for transcription`);
328
+ await this.queueSliceForTranscription(slice);
329
+ } else {
330
+ this.log(`Speech end segment too short in slice ${slice.index} (${speechDuration.toFixed(2)}s < ${minDuration}s), skipping`);
331
+ }
332
+ } else {
333
+ this.log(`No speech detected in slice ${slice.index}`);
334
+ }
335
+
336
+ // Emit stats update for VAD change
337
+ this.emitStatsUpdate('vad_change');
338
+ } catch (error) {
339
+ this.handleError(`VAD processing error for slice ${slice.index}: ${error}`);
340
+ }
341
+ }
342
+
343
+ /**
344
+ * Queue a slice for transcription
345
+ */
346
+ async queueSliceForTranscription(slice) {
347
+ try {
348
+ // Get audio data from the slice
349
+ const audioData = this.sliceManager.getAudioDataForTranscription(slice.index);
350
+ if (!audioData) {
351
+ this.log(`No audio data available for slice ${slice.index}`);
352
+ return;
353
+ }
354
+
355
+ // Add to transcription queue
356
+ this.transcriptionQueue.unshift({
357
+ sliceIndex: slice.index,
358
+ audioData
359
+ });
360
+ this.log(`Queued slice ${slice.index} for transcription (${slice.data.length} samples)`);
361
+ await this.processTranscriptionQueue();
362
+ } catch (error) {
363
+ this.handleError(`Failed to queue slice for transcription: ${error}`);
364
+ }
365
+ }
366
+
367
+ /**
368
+ * Detect speech using VAD context
369
+ */
370
+ async detectSpeech(audioData, sliceIndex) {
371
+ if (!this.vadContext) {
372
+ // When no VAD context is available, assume speech is always detected
373
+ // but still follow the state machine pattern
374
+ const currentTimestamp = Date.now();
375
+
376
+ // Assume speech is always detected when no VAD context
377
+ const vadEventType = this.lastVadState === 'silence' ? 'speech_start' : 'speech_continue';
378
+
379
+ // Update VAD state
380
+ this.lastVadState = 'speech';
381
+ const {
382
+ sampleRate = 16000
383
+ } = this.options.audioStreamConfig || {};
384
+ return {
385
+ type: vadEventType,
386
+ lastSpeechDetectedTime: 0,
387
+ timestamp: currentTimestamp,
388
+ confidence: 1.0,
389
+ duration: audioData.length / sampleRate / 2,
390
+ // Convert bytes to seconds
391
+ sliceIndex
392
+ };
393
+ }
394
+ try {
395
+ const audioBuffer = audioData.buffer;
396
+
397
+ // Use VAD context to detect speech segments
398
+ const vadSegments = await this.vadContext.detectSpeechData(audioBuffer, this.options.vadOptions);
399
+
400
+ // Calculate confidence based on speech segments
401
+ let confidence = 0.0;
402
+ let lastSpeechDetectedTime = 0;
403
+ if (vadSegments && vadSegments.length > 0) {
404
+ var _vadSegments;
405
+ // If there are speech segments, calculate average confidence
406
+ const totalTime = vadSegments.reduce((sum, segment) => sum + (segment.t1 - segment.t0), 0);
407
+ const audioDuration = audioData.length / 16000 / 2; // Convert bytes to seconds
408
+ confidence = totalTime > 0 ? Math.min(totalTime / audioDuration, 1.0) : 0.0;
409
+ lastSpeechDetectedTime = ((_vadSegments = vadSegments[vadSegments.length - 1]) === null || _vadSegments === void 0 ? void 0 : _vadSegments.t1) || -1;
410
+ }
411
+ const threshold = this.options.vadOptions.threshold || 0.5;
412
+ let isSpeech = confidence > threshold;
413
+ const currentTimestamp = Date.now();
414
+
415
+ // Determine VAD event type based on current and previous state
416
+ let vadEventType;
417
+ if (isSpeech) {
418
+ vadEventType = this.lastVadState === 'silence' ? 'speech_start' : 'speech_continue';
419
+ const minDuration = this.options.audioMinSec;
420
+ // Check if this is a new speech detection (different from last detected time)
421
+ if (lastSpeechDetectedTime === this.lastSpeechDetectedTime || (lastSpeechDetectedTime - this.lastSpeechDetectedTime) / 100 < minDuration) {
422
+ if (this.lastVadState === 'silence') vadEventType = 'silence';
423
+ if (this.lastVadState === 'speech') vadEventType = 'speech_end';
424
+ isSpeech = false;
425
+ confidence = 0.0;
426
+ }
427
+ this.lastSpeechDetectedTime = lastSpeechDetectedTime;
428
+ } else {
429
+ vadEventType = this.lastVadState === 'speech' ? 'speech_end' : 'silence';
430
+ }
431
+
432
+ // Update VAD state for next detection
433
+ this.lastVadState = isSpeech ? 'speech' : 'silence';
434
+ const {
435
+ sampleRate = 16000
436
+ } = this.options.audioStreamConfig || {};
437
+ return {
438
+ type: vadEventType,
439
+ lastSpeechDetectedTime,
440
+ timestamp: currentTimestamp,
441
+ confidence,
442
+ duration: audioData.length / sampleRate / 2,
443
+ // Convert bytes to seconds
444
+ sliceIndex,
445
+ currentThreshold: threshold
446
+ };
447
+ } catch (error) {
448
+ this.log(`VAD detection error: ${error}`);
449
+ // Re-throw the error so it can be handled by the caller
450
+ throw error;
451
+ }
452
+ }
453
+ isProcessingTranscriptionQueue = false;
454
+
455
+ /**
456
+ * Process the transcription queue
457
+ */
458
+ async processTranscriptionQueue() {
459
+ if (this.isProcessingTranscriptionQueue) return;
460
+ this.isProcessingTranscriptionQueue = true;
461
+ while (this.transcriptionQueue.length > 0) {
462
+ const item = this.transcriptionQueue.shift();
463
+ this.transcriptionQueue = []; // Old items are not needed anymore
464
+ if (item) {
465
+ // eslint-disable-next-line no-await-in-loop
466
+ await this.processTranscription(item).catch(error => {
467
+ this.handleError(`Transcription error: ${error}`);
468
+ });
469
+ }
470
+ }
471
+ this.isProcessingTranscriptionQueue = false;
472
+ }
473
+
474
+ /**
475
+ * Build prompt from initial prompt and previous slices
476
+ */
477
+ buildPrompt(currentSliceIndex) {
478
+ const promptParts = [];
479
+
480
+ // Add initial prompt if provided
481
+ if (this.options.initialPrompt) {
482
+ promptParts.push(this.options.initialPrompt);
483
+ }
484
+
485
+ // Add previous slice results if enabled
486
+ if (this.options.promptPreviousSlices) {
487
+ // Get transcription results from previous slices (up to the current slice)
488
+ const previousResults = Array.from(this.transcriptionResults.entries()).filter(_ref => {
489
+ let [sliceIndex] = _ref;
490
+ return sliceIndex < currentSliceIndex;
491
+ }).sort((_ref2, _ref3) => {
492
+ let [a] = _ref2;
493
+ let [b] = _ref3;
494
+ return a - b;
495
+ }) // Sort by slice index
496
+ .map(_ref4 => {
497
+ var _result$transcribeEve;
498
+ let [, result] = _ref4;
499
+ return (_result$transcribeEve = result.transcribeEvent.data) === null || _result$transcribeEve === void 0 ? void 0 : _result$transcribeEve.result;
500
+ }).filter(result => Boolean(result)); // Filter out empty results with type guard
501
+
502
+ if (previousResults.length > 0) {
503
+ promptParts.push(...previousResults);
504
+ }
505
+ }
506
+ return promptParts.join(' ') || undefined;
507
+ }
508
+
509
+ /**
510
+ * Process a single transcription
511
+ */
512
+ async processTranscription(item) {
513
+ if (!this.isActive) {
514
+ return;
515
+ }
516
+ this.isTranscribing = true;
517
+
518
+ // Emit stats update for status change
519
+ this.emitStatsUpdate('status_change');
520
+ const startTime = Date.now();
521
+ try {
522
+ var _this$callbacks$onTra, _this$callbacks5;
523
+ // Build prompt from initial prompt and previous slices
524
+ const prompt = this.buildPrompt(item.sliceIndex);
525
+ const audioBuffer = item.audioData.buffer;
526
+ const {
527
+ promise
528
+ } = this.whisperContext.transcribeData(audioBuffer, {
529
+ ...this.options.transcribeOptions,
530
+ prompt,
531
+ // Include the constructed prompt
532
+ onProgress: undefined // Disable progress for realtime
533
+ });
534
+
535
+ const result = await promise;
536
+ const endTime = Date.now();
537
+
538
+ // Create transcribe event
539
+ const {
540
+ sampleRate = 16000
541
+ } = this.options.audioStreamConfig || {};
542
+ const transcribeEvent = {
543
+ type: 'transcribe',
544
+ sliceIndex: item.sliceIndex,
545
+ data: result,
546
+ isCapturing: this.audioStream.isRecording(),
547
+ processTime: endTime - startTime,
548
+ recordingTime: item.audioData.length / (sampleRate / 1000) / 2,
549
+ // ms,
550
+ memoryUsage: this.sliceManager.getMemoryUsage()
551
+ };
552
+
553
+ // Emit transcribe event
554
+ (_this$callbacks$onTra = (_this$callbacks5 = this.callbacks).onTranscribe) === null || _this$callbacks$onTra === void 0 ? void 0 : _this$callbacks$onTra.call(_this$callbacks5, transcribeEvent);
555
+
556
+ // Save transcription results
557
+ const slice = this.sliceManager.getSliceByIndex(item.sliceIndex);
558
+ if (slice) {
559
+ this.transcriptionResults.set(item.sliceIndex, {
560
+ slice: {
561
+ // Don't keep data in the slice
562
+ index: slice.index,
563
+ sampleCount: slice.sampleCount,
564
+ startTime: slice.startTime,
565
+ endTime: slice.endTime,
566
+ isProcessed: slice.isProcessed,
567
+ isReleased: slice.isReleased
568
+ },
569
+ transcribeEvent
570
+ });
571
+ }
572
+
573
+ // Emit stats update for memory/slice changes
574
+ this.emitStatsUpdate('memory_change');
575
+ this.log(`Transcribed speech segment ${item.sliceIndex}: "${result.result}"`);
576
+ } catch (error) {
577
+ var _this$callbacks$onTra2, _this$callbacks6;
578
+ // Emit error event to transcribe callback
579
+ const errorEvent = {
580
+ type: 'error',
581
+ sliceIndex: item.sliceIndex,
582
+ data: undefined,
583
+ isCapturing: this.audioStream.isRecording(),
584
+ processTime: Date.now() - startTime,
585
+ recordingTime: 0,
586
+ memoryUsage: this.sliceManager.getMemoryUsage()
587
+ };
588
+ (_this$callbacks$onTra2 = (_this$callbacks6 = this.callbacks).onTranscribe) === null || _this$callbacks$onTra2 === void 0 ? void 0 : _this$callbacks$onTra2.call(_this$callbacks6, errorEvent);
589
+ this.handleError(`Transcription failed for speech segment ${item.sliceIndex}: ${error}`);
590
+ } finally {
591
+ // Check if we should continue processing queue
592
+ if (this.transcriptionQueue.length > 0) {
593
+ await this.processTranscriptionQueue();
594
+ } else {
595
+ this.isTranscribing = false;
596
+ }
597
+ }
598
+ }
599
+
600
+ /**
601
+ * Handle audio status changes
602
+ */
603
+ handleAudioStatusChange(isRecording) {
604
+ this.log(`Audio recording: ${isRecording ? 'started' : 'stopped'}`);
605
+ }
606
+
607
+ /**
608
+ * Handle errors from components
609
+ */
610
+ handleError(error) {
611
+ var _this$callbacks$onErr, _this$callbacks7;
612
+ this.log(`Error: ${error}`);
613
+ (_this$callbacks$onErr = (_this$callbacks7 = this.callbacks).onError) === null || _this$callbacks$onErr === void 0 ? void 0 : _this$callbacks$onErr.call(_this$callbacks7, error);
614
+ }
615
+
616
+ /**
617
+ * Update callbacks
618
+ */
619
+ updateCallbacks(callbacks) {
620
+ this.callbacks = {
621
+ ...this.callbacks,
622
+ ...callbacks
623
+ };
624
+ }
625
+
626
+ /**
627
+ * Update VAD options dynamically
628
+ */
629
+ updateVadOptions(options) {
630
+ this.options.vadOptions = {
631
+ ...this.options.vadOptions,
632
+ ...options
633
+ };
634
+ }
635
+
636
+ /**
637
+ * Update auto-slice options dynamically
638
+ */
639
+ updateAutoSliceOptions(options) {
640
+ if (options.autoSliceOnSpeechEnd !== undefined) {
641
+ this.options.autoSliceOnSpeechEnd = options.autoSliceOnSpeechEnd;
642
+ }
643
+ if (options.autoSliceThreshold !== undefined) {
644
+ this.options.autoSliceThreshold = options.autoSliceThreshold;
645
+ }
646
+ this.log(`Auto-slice options updated: enabled=${this.options.autoSliceOnSpeechEnd}, threshold=${this.options.autoSliceThreshold}`);
647
+ }
648
+
649
+ /**
650
+ * Get current statistics
651
+ */
652
+ getStatistics() {
653
+ return {
654
+ isActive: this.isActive,
655
+ isTranscribing: this.isTranscribing,
656
+ vadEnabled: this.vadEnabled,
657
+ audioStats: {
658
+ isRecording: this.audioStream.isRecording(),
659
+ accumulatedSamples: this.accumulatedData.length
660
+ },
661
+ vadStats: this.vadEnabled ? {
662
+ enabled: true,
663
+ contextAvailable: !!this.vadContext,
664
+ lastSpeechDetectedTime: this.lastSpeechDetectedTime
665
+ } : null,
666
+ sliceStats: this.sliceManager.getCurrentSliceInfo(),
667
+ autoSliceConfig: {
668
+ enabled: this.options.autoSliceOnSpeechEnd,
669
+ threshold: this.options.autoSliceThreshold,
670
+ targetDuration: this.options.audioSliceSec,
671
+ minDuration: this.options.audioMinSec
672
+ }
673
+ };
674
+ }
675
+
676
+ /**
677
+ * Get all transcription results
678
+ */
679
+ getTranscriptionResults() {
680
+ return Array.from(this.transcriptionResults.values());
681
+ }
682
+
683
+ /**
684
+ * Force move to the next slice, finalizing the current one regardless of capacity
685
+ */
686
+ async nextSlice() {
687
+ var _this$callbacks$onTra3, _this$callbacks8;
688
+ if (!this.isActive) {
689
+ this.log('Cannot force next slice - transcriber is not active');
690
+ return;
691
+ }
692
+
693
+ // Emit start event to indicate slice processing has started
694
+ const startEvent = {
695
+ type: 'start',
696
+ sliceIndex: -1,
697
+ // Use -1 to indicate forced slice
698
+ data: undefined,
699
+ isCapturing: this.audioStream.isRecording(),
700
+ processTime: 0,
701
+ recordingTime: 0,
702
+ memoryUsage: this.sliceManager.getMemoryUsage()
703
+ };
704
+ (_this$callbacks$onTra3 = (_this$callbacks8 = this.callbacks).onTranscribe) === null || _this$callbacks$onTra3 === void 0 ? void 0 : _this$callbacks$onTra3.call(_this$callbacks8, startEvent);
705
+
706
+ // Check if there are pending transcriptions or currently transcribing
707
+ if (this.isTranscribing || this.transcriptionQueue.length > 0) {
708
+ this.log('Waiting for pending transcriptions to complete before forcing next slice...');
709
+
710
+ // Wait for current transcription queue to be processed
711
+ await this.processTranscriptionQueue();
712
+ }
713
+ const result = this.sliceManager.forceNextSlice();
714
+ if (result.slice) {
715
+ this.log(`Forced slice ${result.slice.index} ready (${result.slice.data.length} bytes)`);
716
+
717
+ // Process VAD for the slice if enabled
718
+ if (!this.isTranscribing && this.vadEnabled) {
719
+ this.processSliceVAD(result.slice).catch(error => {
720
+ this.handleError(`VAD processing error: ${error}`);
721
+ });
722
+ } else if (!this.isTranscribing) {
723
+ // If VAD is disabled, transcribe slices as they become ready
724
+ this.queueSliceForTranscription(result.slice).catch(error => {
725
+ this.handleError(`Failed to queue slice for transcription: ${error}`);
726
+ });
727
+ } else {
728
+ this.log(`Skipping slice ${result.slice.index} - already transcribing`);
729
+ }
730
+ this.emitStatsUpdate('memory_change');
731
+ } else {
732
+ this.log('Forced next slice but no slice data to process');
733
+ }
734
+ }
735
+
736
+ /**
737
+ * Reset all components
738
+ */
739
+ reset() {
740
+ this.sliceManager.reset();
741
+ this.transcriptionQueue = [];
742
+ this.isTranscribing = false;
743
+ this.accumulatedData = new Uint8Array(0);
744
+
745
+ // Reset simplified VAD state
746
+ this.lastSpeechDetectedTime = -1;
747
+ this.lastVadState = 'silence';
748
+
749
+ // Reset stats snapshot for clean start
750
+ this.lastStatsSnapshot = null;
751
+
752
+ // Cancel WAV file writing if in progress
753
+ if (this.wavFileWriter) {
754
+ this.wavFileWriter.cancel().catch(error => {
755
+ this.log(`Failed to cancel WAV file writing: ${error}`);
756
+ });
757
+ this.wavFileWriter = null;
758
+ }
759
+
760
+ // Clear transcription results
761
+ this.transcriptionResults.clear();
762
+ }
763
+
764
+ /**
765
+ * Release all resources
766
+ */
767
+ async release() {
768
+ var _this$wavFileWriter;
769
+ if (this.isActive) {
770
+ await this.stop();
771
+ }
772
+ await this.audioStream.release();
773
+ await ((_this$wavFileWriter = this.wavFileWriter) === null || _this$wavFileWriter === void 0 ? void 0 : _this$wavFileWriter.finalize());
774
+ this.vadContext = undefined;
775
+ }
776
+
777
+ /**
778
+ * Emit stats update event if stats have changed significantly
779
+ */
780
+ emitStatsUpdate(eventType) {
781
+ const currentStats = this.getStatistics();
782
+
783
+ // Check if stats have changed significantly
784
+ if (!this.lastStatsSnapshot || RealtimeTranscriber.shouldEmitStatsUpdate(currentStats, this.lastStatsSnapshot)) {
785
+ var _this$callbacks$onSta4, _this$callbacks9;
786
+ const statsEvent = {
787
+ timestamp: Date.now(),
788
+ type: eventType,
789
+ data: currentStats
790
+ };
791
+ (_this$callbacks$onSta4 = (_this$callbacks9 = this.callbacks).onStatsUpdate) === null || _this$callbacks$onSta4 === void 0 ? void 0 : _this$callbacks$onSta4.call(_this$callbacks9, statsEvent);
792
+ this.lastStatsSnapshot = {
793
+ ...currentStats
794
+ };
795
+ }
796
+ }
797
+
798
+ /**
799
+ * Determine if stats update should be emitted
800
+ */
801
+ static shouldEmitStatsUpdate(current, previous) {
802
+ var _current$sliceStats, _current$sliceStats$m, _previous$sliceStats, _previous$sliceStats$;
803
+ // Always emit on status changes
804
+ if (current.isActive !== previous.isActive || current.isTranscribing !== previous.isTranscribing) {
805
+ return true;
806
+ }
807
+
808
+ // Emit on significant memory changes (>10% or >5MB)
809
+ const currentMemory = ((_current$sliceStats = current.sliceStats) === null || _current$sliceStats === void 0 ? void 0 : (_current$sliceStats$m = _current$sliceStats.memoryUsage) === null || _current$sliceStats$m === void 0 ? void 0 : _current$sliceStats$m.estimatedMB) || 0;
810
+ const previousMemory = ((_previous$sliceStats = previous.sliceStats) === null || _previous$sliceStats === void 0 ? void 0 : (_previous$sliceStats$ = _previous$sliceStats.memoryUsage) === null || _previous$sliceStats$ === void 0 ? void 0 : _previous$sliceStats$.estimatedMB) || 0;
811
+ const memoryDiff = Math.abs(currentMemory - previousMemory);
812
+ if (memoryDiff > 5 || previousMemory > 0 && memoryDiff / previousMemory > 0.1) {
813
+ return true;
814
+ }
815
+ return false;
816
+ }
817
+
818
+ /**
819
+ * Logger function
820
+ */
821
+ log(message) {
822
+ this.options.logger(`[RealtimeTranscriber] ${message}`);
823
+ }
824
+ }
825
+ //# sourceMappingURL=RealtimeTranscriber.js.map