whisper.rn 0.5.0-rc.1 → 0.5.0-rc.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +119 -50
- package/lib/commonjs/AudioSessionIos.js +2 -1
- package/lib/commonjs/AudioSessionIos.js.map +1 -1
- package/lib/commonjs/index.js +1 -0
- package/lib/commonjs/index.js.map +1 -1
- package/lib/commonjs/jest-mock.js +126 -0
- package/lib/commonjs/jest-mock.js.map +1 -0
- package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +831 -0
- package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -0
- package/lib/commonjs/realtime-transcription/SliceManager.js +233 -0
- package/lib/commonjs/realtime-transcription/SliceManager.js.map +1 -0
- package/lib/commonjs/realtime-transcription/adapters/AudioPcmStreamAdapter.js +133 -0
- package/lib/commonjs/realtime-transcription/adapters/AudioPcmStreamAdapter.js.map +1 -0
- package/lib/commonjs/realtime-transcription/adapters/JestAudioStreamAdapter.js +201 -0
- package/lib/commonjs/realtime-transcription/adapters/JestAudioStreamAdapter.js.map +1 -0
- package/lib/commonjs/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js +309 -0
- package/lib/commonjs/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js.map +1 -0
- package/lib/commonjs/realtime-transcription/index.js +27 -0
- package/lib/commonjs/realtime-transcription/index.js.map +1 -0
- package/lib/commonjs/realtime-transcription/types.js +114 -0
- package/lib/commonjs/realtime-transcription/types.js.map +1 -0
- package/lib/commonjs/utils/WavFileReader.js +158 -0
- package/lib/commonjs/utils/WavFileReader.js.map +1 -0
- package/lib/commonjs/utils/WavFileWriter.js +181 -0
- package/lib/commonjs/utils/WavFileWriter.js.map +1 -0
- package/lib/commonjs/utils/common.js +25 -0
- package/lib/commonjs/utils/common.js.map +1 -0
- package/lib/module/AudioSessionIos.js +2 -1
- package/lib/module/AudioSessionIos.js.map +1 -1
- package/lib/module/index.js +1 -0
- package/lib/module/index.js.map +1 -1
- package/lib/module/jest-mock.js +124 -0
- package/lib/module/jest-mock.js.map +1 -0
- package/lib/module/realtime-transcription/RealtimeTranscriber.js +825 -0
- package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -0
- package/lib/module/realtime-transcription/SliceManager.js +226 -0
- package/lib/module/realtime-transcription/SliceManager.js.map +1 -0
- package/lib/module/realtime-transcription/adapters/AudioPcmStreamAdapter.js +124 -0
- package/lib/module/realtime-transcription/adapters/AudioPcmStreamAdapter.js.map +1 -0
- package/lib/module/realtime-transcription/adapters/JestAudioStreamAdapter.js +194 -0
- package/lib/module/realtime-transcription/adapters/JestAudioStreamAdapter.js.map +1 -0
- package/lib/module/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js +302 -0
- package/lib/module/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.js.map +1 -0
- package/lib/module/realtime-transcription/index.js +8 -0
- package/lib/module/realtime-transcription/index.js.map +1 -0
- package/lib/module/realtime-transcription/types.js +107 -0
- package/lib/module/realtime-transcription/types.js.map +1 -0
- package/lib/module/utils/WavFileReader.js +151 -0
- package/lib/module/utils/WavFileReader.js.map +1 -0
- package/lib/module/utils/WavFileWriter.js +174 -0
- package/lib/module/utils/WavFileWriter.js.map +1 -0
- package/lib/module/utils/common.js +18 -0
- package/lib/module/utils/common.js.map +1 -0
- package/lib/typescript/AudioSessionIos.d.ts +1 -1
- package/lib/typescript/AudioSessionIos.d.ts.map +1 -1
- package/lib/typescript/index.d.ts.map +1 -1
- package/lib/typescript/jest-mock.d.ts +2 -0
- package/lib/typescript/jest-mock.d.ts.map +1 -0
- package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts +165 -0
- package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -0
- package/lib/typescript/realtime-transcription/SliceManager.d.ts +72 -0
- package/lib/typescript/realtime-transcription/SliceManager.d.ts.map +1 -0
- package/lib/typescript/realtime-transcription/adapters/AudioPcmStreamAdapter.d.ts +22 -0
- package/lib/typescript/realtime-transcription/adapters/AudioPcmStreamAdapter.d.ts.map +1 -0
- package/lib/typescript/realtime-transcription/adapters/JestAudioStreamAdapter.d.ts +44 -0
- package/lib/typescript/realtime-transcription/adapters/JestAudioStreamAdapter.d.ts.map +1 -0
- package/lib/typescript/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.d.ts +75 -0
- package/lib/typescript/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.d.ts.map +1 -0
- package/lib/typescript/realtime-transcription/index.d.ts +6 -0
- package/lib/typescript/realtime-transcription/index.d.ts.map +1 -0
- package/lib/typescript/realtime-transcription/types.d.ts +216 -0
- package/lib/typescript/realtime-transcription/types.d.ts.map +1 -0
- package/lib/typescript/utils/WavFileReader.d.ts +61 -0
- package/lib/typescript/utils/WavFileReader.d.ts.map +1 -0
- package/lib/typescript/utils/WavFileWriter.d.ts +57 -0
- package/lib/typescript/utils/WavFileWriter.d.ts.map +1 -0
- package/lib/typescript/utils/common.d.ts +9 -0
- package/lib/typescript/utils/common.d.ts.map +1 -0
- package/package.json +18 -6
- package/src/AudioSessionIos.ts +3 -2
- package/src/index.ts +4 -0
- package/{jest/mock.js → src/jest-mock.ts} +2 -2
- package/src/realtime-transcription/RealtimeTranscriber.ts +983 -0
- package/src/realtime-transcription/SliceManager.ts +252 -0
- package/src/realtime-transcription/adapters/AudioPcmStreamAdapter.ts +143 -0
- package/src/realtime-transcription/adapters/JestAudioStreamAdapter.ts +251 -0
- package/src/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.ts +378 -0
- package/src/realtime-transcription/index.ts +34 -0
- package/src/realtime-transcription/types.ts +277 -0
- package/src/utils/WavFileReader.ts +202 -0
- package/src/utils/WavFileWriter.ts +206 -0
- package/src/utils/common.ts +17 -0
|
@@ -0,0 +1,825 @@
|
|
|
1
|
+
/* eslint-disable class-methods-use-this */
|
|
2
|
+
|
|
3
|
+
import { SliceManager } from './SliceManager';
|
|
4
|
+
import { WavFileWriter } from '../utils/WavFileWriter';
|
|
5
|
+
import { VAD_PRESETS } from './types';
|
|
6
|
+
|
|
7
|
+
/**
|
|
8
|
+
* RealtimeTranscriber provides real-time audio transcription with VAD support.
|
|
9
|
+
*
|
|
10
|
+
* Features:
|
|
11
|
+
* - Automatic slice management based on duration
|
|
12
|
+
* - VAD-based speech detection and auto-slicing
|
|
13
|
+
* - Configurable auto-slice mechanism that triggers on speech_end/silence events
|
|
14
|
+
* - Memory management for audio slices
|
|
15
|
+
* - Queue-based transcription processing
|
|
16
|
+
*/
|
|
17
|
+
export class RealtimeTranscriber {
|
|
18
|
+
callbacks = {};
|
|
19
|
+
isActive = false;
|
|
20
|
+
isTranscribing = false;
|
|
21
|
+
vadEnabled = false;
|
|
22
|
+
transcriptionQueue = [];
|
|
23
|
+
accumulatedData = new Uint8Array(0);
|
|
24
|
+
wavFileWriter = null;
|
|
25
|
+
|
|
26
|
+
// Simplified VAD state management
|
|
27
|
+
lastSpeechDetectedTime = 0;
|
|
28
|
+
|
|
29
|
+
// Track VAD state for proper event transitions
|
|
30
|
+
lastVadState = 'silence';
|
|
31
|
+
|
|
32
|
+
// Track last stats to emit only when changed
|
|
33
|
+
lastStatsSnapshot = null;
|
|
34
|
+
|
|
35
|
+
// Store transcription results by slice index
|
|
36
|
+
transcriptionResults = new Map();
|
|
37
|
+
constructor(dependencies) {
|
|
38
|
+
let options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};
|
|
39
|
+
let callbacks = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : {};
|
|
40
|
+
this.whisperContext = dependencies.whisperContext;
|
|
41
|
+
this.vadContext = dependencies.vadContext;
|
|
42
|
+
this.audioStream = dependencies.audioStream;
|
|
43
|
+
this.fs = dependencies.fs;
|
|
44
|
+
this.callbacks = callbacks;
|
|
45
|
+
|
|
46
|
+
// Set default options with proper types
|
|
47
|
+
this.options = {
|
|
48
|
+
audioSliceSec: options.audioSliceSec || 30,
|
|
49
|
+
audioMinSec: options.audioMinSec || 1,
|
|
50
|
+
maxSlicesInMemory: options.maxSlicesInMemory || 3,
|
|
51
|
+
vadOptions: options.vadOptions || VAD_PRESETS.default,
|
|
52
|
+
vadPreset: options.vadPreset,
|
|
53
|
+
autoSliceOnSpeechEnd: options.autoSliceOnSpeechEnd || true,
|
|
54
|
+
autoSliceThreshold: options.autoSliceThreshold || 0.5,
|
|
55
|
+
transcribeOptions: options.transcribeOptions || {},
|
|
56
|
+
initialPrompt: options.initialPrompt,
|
|
57
|
+
promptPreviousSlices: options.promptPreviousSlices ?? true,
|
|
58
|
+
audioOutputPath: options.audioOutputPath,
|
|
59
|
+
logger: options.logger || (() => {})
|
|
60
|
+
};
|
|
61
|
+
|
|
62
|
+
// Apply VAD preset if specified
|
|
63
|
+
if (this.options.vadPreset && VAD_PRESETS[this.options.vadPreset]) {
|
|
64
|
+
this.options.vadOptions = {
|
|
65
|
+
...VAD_PRESETS[this.options.vadPreset],
|
|
66
|
+
...this.options.vadOptions
|
|
67
|
+
};
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
// Enable VAD if context is provided and not explicitly disabled
|
|
71
|
+
this.vadEnabled = !!this.vadContext;
|
|
72
|
+
|
|
73
|
+
// Initialize managers
|
|
74
|
+
this.sliceManager = new SliceManager(this.options.audioSliceSec, this.options.maxSlicesInMemory);
|
|
75
|
+
|
|
76
|
+
// Set up audio stream callbacks
|
|
77
|
+
this.audioStream.onData(this.handleAudioData.bind(this));
|
|
78
|
+
this.audioStream.onError(this.handleError.bind(this));
|
|
79
|
+
this.audioStream.onStatusChange(this.handleAudioStatusChange.bind(this));
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
/**
|
|
83
|
+
* Start realtime transcription
|
|
84
|
+
*/
|
|
85
|
+
async start() {
|
|
86
|
+
if (this.isActive) {
|
|
87
|
+
throw new Error('Realtime transcription is already active');
|
|
88
|
+
}
|
|
89
|
+
try {
|
|
90
|
+
var _this$callbacks$onSta, _this$callbacks, _this$options$audioSt4, _this$options$audioSt5, _this$options$audioSt6, _this$options$audioSt7, _this$options$audioSt8;
|
|
91
|
+
this.isActive = true;
|
|
92
|
+
(_this$callbacks$onSta = (_this$callbacks = this.callbacks).onStatusChange) === null || _this$callbacks$onSta === void 0 ? void 0 : _this$callbacks$onSta.call(_this$callbacks, true);
|
|
93
|
+
|
|
94
|
+
// Reset all state to ensure clean start
|
|
95
|
+
this.reset();
|
|
96
|
+
|
|
97
|
+
// Initialize WAV file writer if output path is specified
|
|
98
|
+
if (this.fs && this.options.audioOutputPath) {
|
|
99
|
+
var _this$options$audioSt, _this$options$audioSt2, _this$options$audioSt3;
|
|
100
|
+
this.wavFileWriter = new WavFileWriter(this.fs, this.options.audioOutputPath, {
|
|
101
|
+
sampleRate: ((_this$options$audioSt = this.options.audioStreamConfig) === null || _this$options$audioSt === void 0 ? void 0 : _this$options$audioSt.sampleRate) || 16000,
|
|
102
|
+
channels: ((_this$options$audioSt2 = this.options.audioStreamConfig) === null || _this$options$audioSt2 === void 0 ? void 0 : _this$options$audioSt2.channels) || 1,
|
|
103
|
+
bitsPerSample: ((_this$options$audioSt3 = this.options.audioStreamConfig) === null || _this$options$audioSt3 === void 0 ? void 0 : _this$options$audioSt3.bitsPerSample) || 16
|
|
104
|
+
});
|
|
105
|
+
await this.wavFileWriter.initialize();
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
// Start audio recording
|
|
109
|
+
await this.audioStream.initialize({
|
|
110
|
+
sampleRate: ((_this$options$audioSt4 = this.options.audioStreamConfig) === null || _this$options$audioSt4 === void 0 ? void 0 : _this$options$audioSt4.sampleRate) || 16000,
|
|
111
|
+
channels: ((_this$options$audioSt5 = this.options.audioStreamConfig) === null || _this$options$audioSt5 === void 0 ? void 0 : _this$options$audioSt5.channels) || 1,
|
|
112
|
+
bitsPerSample: ((_this$options$audioSt6 = this.options.audioStreamConfig) === null || _this$options$audioSt6 === void 0 ? void 0 : _this$options$audioSt6.bitsPerSample) || 16,
|
|
113
|
+
audioSource: ((_this$options$audioSt7 = this.options.audioStreamConfig) === null || _this$options$audioSt7 === void 0 ? void 0 : _this$options$audioSt7.audioSource) || 6,
|
|
114
|
+
bufferSize: ((_this$options$audioSt8 = this.options.audioStreamConfig) === null || _this$options$audioSt8 === void 0 ? void 0 : _this$options$audioSt8.bufferSize) || 16 * 1024
|
|
115
|
+
});
|
|
116
|
+
await this.audioStream.start();
|
|
117
|
+
|
|
118
|
+
// Emit stats update for status change
|
|
119
|
+
this.emitStatsUpdate('status_change');
|
|
120
|
+
this.log('Realtime transcription started');
|
|
121
|
+
} catch (error) {
|
|
122
|
+
var _this$callbacks$onSta2, _this$callbacks2;
|
|
123
|
+
this.isActive = false;
|
|
124
|
+
(_this$callbacks$onSta2 = (_this$callbacks2 = this.callbacks).onStatusChange) === null || _this$callbacks$onSta2 === void 0 ? void 0 : _this$callbacks$onSta2.call(_this$callbacks2, false);
|
|
125
|
+
throw error;
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
/**
|
|
130
|
+
* Stop realtime transcription
|
|
131
|
+
*/
|
|
132
|
+
async stop() {
|
|
133
|
+
if (!this.isActive) {
|
|
134
|
+
return;
|
|
135
|
+
}
|
|
136
|
+
try {
|
|
137
|
+
var _this$callbacks$onSta3, _this$callbacks3;
|
|
138
|
+
this.isActive = false;
|
|
139
|
+
|
|
140
|
+
// Stop audio recording
|
|
141
|
+
await this.audioStream.stop();
|
|
142
|
+
|
|
143
|
+
// Process any remaining accumulated data
|
|
144
|
+
if (this.accumulatedData.length > 0) {
|
|
145
|
+
this.processAccumulatedDataForSliceManagement();
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
// Process any remaining queued transcriptions
|
|
149
|
+
await this.processTranscriptionQueue();
|
|
150
|
+
|
|
151
|
+
// Finalize WAV file
|
|
152
|
+
if (this.wavFileWriter) {
|
|
153
|
+
await this.wavFileWriter.finalize();
|
|
154
|
+
this.wavFileWriter = null;
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
// Reset all state completely
|
|
158
|
+
this.reset();
|
|
159
|
+
(_this$callbacks$onSta3 = (_this$callbacks3 = this.callbacks).onStatusChange) === null || _this$callbacks$onSta3 === void 0 ? void 0 : _this$callbacks$onSta3.call(_this$callbacks3, false);
|
|
160
|
+
|
|
161
|
+
// Emit stats update for status change
|
|
162
|
+
this.emitStatsUpdate('status_change');
|
|
163
|
+
this.log('Realtime transcription stopped');
|
|
164
|
+
} catch (error) {
|
|
165
|
+
this.handleError(`Stop error: ${error}`);
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
/**
|
|
170
|
+
* Handle incoming audio data from audio stream
|
|
171
|
+
*/
|
|
172
|
+
handleAudioData(streamData) {
|
|
173
|
+
if (!this.isActive) {
|
|
174
|
+
return;
|
|
175
|
+
}
|
|
176
|
+
try {
|
|
177
|
+
// Write to WAV file if enabled (convert to Uint8Array for WavFileWriter)
|
|
178
|
+
if (this.wavFileWriter) {
|
|
179
|
+
this.wavFileWriter.appendAudioData(streamData.data).catch(error => {
|
|
180
|
+
this.log(`Failed to write audio to WAV file: ${error}`);
|
|
181
|
+
});
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
// Always accumulate data for slice management
|
|
185
|
+
this.accumulateAudioData(streamData.data);
|
|
186
|
+
} catch (error) {
|
|
187
|
+
const errorMessage = error instanceof Error ? error.message : 'Audio processing error';
|
|
188
|
+
this.handleError(errorMessage);
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
/**
|
|
193
|
+
* Accumulate audio data for slice management
|
|
194
|
+
*/
|
|
195
|
+
accumulateAudioData(newData) {
|
|
196
|
+
const combined = new Uint8Array(this.accumulatedData.length + newData.length);
|
|
197
|
+
combined.set(this.accumulatedData);
|
|
198
|
+
combined.set(new Uint8Array(newData), this.accumulatedData.length);
|
|
199
|
+
this.accumulatedData = combined;
|
|
200
|
+
|
|
201
|
+
// Process accumulated data when we have enough for slice management
|
|
202
|
+
const minBufferSamples = 16000 * 1; // 1 second for slice management
|
|
203
|
+
if (this.accumulatedData.length >= minBufferSamples) {
|
|
204
|
+
this.processAccumulatedDataForSliceManagement();
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
/**
|
|
209
|
+
* Process accumulated audio data through SliceManager
|
|
210
|
+
*/
|
|
211
|
+
processAccumulatedDataForSliceManagement() {
|
|
212
|
+
if (this.accumulatedData.length === 0) {
|
|
213
|
+
return;
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
// Process through slice manager directly with Uint8Array
|
|
217
|
+
const result = this.sliceManager.addAudioData(this.accumulatedData);
|
|
218
|
+
if (result.slice) {
|
|
219
|
+
this.log(`Slice ${result.slice.index} ready (${result.slice.data.length} bytes)`);
|
|
220
|
+
|
|
221
|
+
// Process VAD for the slice if enabled
|
|
222
|
+
if (!this.isTranscribing && this.vadEnabled) {
|
|
223
|
+
this.processSliceVAD(result.slice).catch(error => {
|
|
224
|
+
this.handleError(`VAD processing error: ${error}`);
|
|
225
|
+
});
|
|
226
|
+
} else if (!this.isTranscribing) {
|
|
227
|
+
// If VAD is disabled, transcribe slices as they become ready
|
|
228
|
+
this.queueSliceForTranscription(result.slice).catch(error => {
|
|
229
|
+
this.handleError(`Failed to queue slice for transcription: ${error}`);
|
|
230
|
+
});
|
|
231
|
+
} else {
|
|
232
|
+
this.log(`Skipping slice ${result.slice.index} - already transcribing`);
|
|
233
|
+
}
|
|
234
|
+
this.emitStatsUpdate('memory_change');
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
// Clear accumulated data
|
|
238
|
+
this.accumulatedData = new Uint8Array(0);
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
/**
|
|
242
|
+
* Check if auto-slice should be triggered based on VAD event and timing
|
|
243
|
+
*/
|
|
244
|
+
async checkAutoSlice(vadEvent, _slice) {
|
|
245
|
+
if (!this.options.autoSliceOnSpeechEnd || !this.vadEnabled) {
|
|
246
|
+
return;
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
// Only trigger on speech_end or silence events
|
|
250
|
+
const shouldTriggerAutoSlice = vadEvent.type === 'speech_end' || vadEvent.type === 'silence';
|
|
251
|
+
if (!shouldTriggerAutoSlice) {
|
|
252
|
+
return;
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
// Get current slice info from SliceManager
|
|
256
|
+
const currentSliceInfo = this.sliceManager.getCurrentSliceInfo();
|
|
257
|
+
const currentSlice = this.sliceManager.getSliceByIndex(currentSliceInfo.currentSliceIndex);
|
|
258
|
+
if (!currentSlice) {
|
|
259
|
+
return;
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
// Calculate current slice duration
|
|
263
|
+
const currentDuration = (Date.now() - currentSlice.startTime) / 1000; // Convert to seconds
|
|
264
|
+
const targetDuration = this.options.audioSliceSec;
|
|
265
|
+
const minDuration = this.options.audioMinSec;
|
|
266
|
+
const autoSliceThreshold = targetDuration * this.options.autoSliceThreshold;
|
|
267
|
+
|
|
268
|
+
// Check if conditions are met for auto-slice
|
|
269
|
+
const meetsMinDuration = currentDuration >= minDuration;
|
|
270
|
+
const meetsThreshold = currentDuration >= autoSliceThreshold;
|
|
271
|
+
if (meetsMinDuration && meetsThreshold) {
|
|
272
|
+
this.log(`Auto-slicing on ${vadEvent.type} at ${currentDuration.toFixed(1)}s ` + `(min: ${minDuration}s, threshold: ${autoSliceThreshold.toFixed(1)}s, target: ${targetDuration}s)`);
|
|
273
|
+
|
|
274
|
+
// Force next slice
|
|
275
|
+
await this.nextSlice();
|
|
276
|
+
} else {
|
|
277
|
+
this.log(`Auto-slice conditions not met on ${vadEvent.type}: ` + `duration=${currentDuration.toFixed(1)}s, min=${minDuration}s, threshold=${autoSliceThreshold.toFixed(1)}s ` + `(minOk=${meetsMinDuration}, thresholdOk=${meetsThreshold})`);
|
|
278
|
+
}
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
/**
 * Process VAD for a completed slice.
 *
 * Runs speech detection over the slice's audio, emits the resulting VAD
 * event via `callbacks.onVad`, lets `checkAutoSlice` react to it, and —
 * when speech is present and long enough — queues the slice for
 * transcription. Errors are routed to `handleError`, never thrown.
 * @param {object} slice - completed slice from SliceManager (has .index, .data)
 */
async processSliceVAD(slice) {
  try {
    var _this$callbacks$onVad, _this$callbacks4;
    // Get audio data from the slice for VAD processing
    const audioData = this.sliceManager.getAudioDataForTranscription(slice.index);
    if (!audioData) {
      this.log(`No audio data available for VAD processing of slice ${slice.index}`);
      return;
    }

    // Convert base64 back to Uint8Array for VAD processing

    // Detect speech in the slice; stamp the event with the emit time
    const vadEvent = await this.detectSpeech(audioData, slice.index);
    vadEvent.timestamp = Date.now();

    // Emit VAD event (compiled optional call: this.callbacks.onVad?.(vadEvent))
    (_this$callbacks$onVad = (_this$callbacks4 = this.callbacks).onVad) === null || _this$callbacks$onVad === void 0 ? void 0 : _this$callbacks$onVad.call(_this$callbacks4, vadEvent);

    // Check if auto-slice should be triggered (may force the next slice)
    await this.checkAutoSlice(vadEvent, slice);

    // Check if speech was detected and if we should transcribe
    const isSpeech = vadEvent.type === 'speech_start' || vadEvent.type === 'speech_continue';
    const isSpeechEnd = vadEvent.type === 'speech_end';
    if (isSpeech) {
      const minDuration = this.options.audioMinSec;
      // Check minimum duration requirement.
      // NOTE(review): duration math assumes 16 kHz, 16-bit samples — confirm
      // against the configured audioStreamConfig.
      const speechDuration = slice.data.length / 16000 / 2; // Convert bytes to seconds (16kHz, 16-bit)

      if (speechDuration >= minDuration) {
        this.log(`Speech detected in slice ${slice.index}, queueing for transcription`);
        await this.queueSliceForTranscription(slice);
      } else {
        this.log(`Speech too short in slice ${slice.index} (${speechDuration.toFixed(2)}s < ${minDuration}s), skipping`);
      }
    } else if (isSpeechEnd) {
      this.log(`Speech ended in slice ${slice.index}`);
      // For speech_end events, we might want to queue the slice for transcription
      // to capture the final part of the speech segment
      const speechDuration = slice.data.length / 16000 / 2; // Convert bytes to seconds
      const minDuration = this.options.audioMinSec;
      if (speechDuration >= minDuration) {
        this.log(`Speech end detected in slice ${slice.index}, queueing final segment for transcription`);
        await this.queueSliceForTranscription(slice);
      } else {
        this.log(`Speech end segment too short in slice ${slice.index} (${speechDuration.toFixed(2)}s < ${minDuration}s), skipping`);
      }
    } else {
      // silence / no-speech event: nothing to transcribe
      this.log(`No speech detected in slice ${slice.index}`);
    }

    // Emit stats update for VAD change
    this.emitStatsUpdate('vad_change');
  } catch (error) {
    this.handleError(`VAD processing error for slice ${slice.index}: ${error}`);
  }
}
|
|
342
|
+
|
|
343
|
+
/**
|
|
344
|
+
* Queue a slice for transcription
|
|
345
|
+
*/
|
|
346
|
+
async queueSliceForTranscription(slice) {
|
|
347
|
+
try {
|
|
348
|
+
// Get audio data from the slice
|
|
349
|
+
const audioData = this.sliceManager.getAudioDataForTranscription(slice.index);
|
|
350
|
+
if (!audioData) {
|
|
351
|
+
this.log(`No audio data available for slice ${slice.index}`);
|
|
352
|
+
return;
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
// Add to transcription queue
|
|
356
|
+
this.transcriptionQueue.unshift({
|
|
357
|
+
sliceIndex: slice.index,
|
|
358
|
+
audioData
|
|
359
|
+
});
|
|
360
|
+
this.log(`Queued slice ${slice.index} for transcription (${slice.data.length} samples)`);
|
|
361
|
+
await this.processTranscriptionQueue();
|
|
362
|
+
} catch (error) {
|
|
363
|
+
this.handleError(`Failed to queue slice for transcription: ${error}`);
|
|
364
|
+
}
|
|
365
|
+
}
|
|
366
|
+
|
|
367
|
+
/**
 * Detect speech using VAD context.
 *
 * Runs the configured VAD over one slice's audio and maps the result onto
 * a small state machine (`lastVadState`: 'silence' | 'speech') to produce
 * one of four event types: speech_start, speech_continue, speech_end,
 * silence. Without a VAD context, speech is assumed to always be present.
 * @param {Uint8Array} audioData - raw PCM bytes for the slice
 * @param {number} sliceIndex - index of the slice being analyzed
 * @returns {Promise<object>} VAD event { type, confidence, duration, ... }
 * @throws re-throws any VAD engine error after logging it
 */
async detectSpeech(audioData, sliceIndex) {
  if (!this.vadContext) {
    // When no VAD context is available, assume speech is always detected
    // but still follow the state machine pattern
    const currentTimestamp = Date.now();

    // Assume speech is always detected when no VAD context
    const vadEventType = this.lastVadState === 'silence' ? 'speech_start' : 'speech_continue';

    // Update VAD state
    this.lastVadState = 'speech';
    const {
      sampleRate = 16000
    } = this.options.audioStreamConfig || {};
    return {
      type: vadEventType,
      lastSpeechDetectedTime: 0,
      timestamp: currentTimestamp,
      confidence: 1.0,
      duration: audioData.length / sampleRate / 2,
      // Convert bytes to seconds
      sliceIndex
    };
  }
  try {
    const audioBuffer = audioData.buffer;

    // Use VAD context to detect speech segments
    const vadSegments = await this.vadContext.detectSpeechData(audioBuffer, this.options.vadOptions);

    // Calculate confidence based on speech segments
    let confidence = 0.0;
    let lastSpeechDetectedTime = 0;
    if (vadSegments && vadSegments.length > 0) {
      var _vadSegments;
      // Confidence = fraction of the slice covered by speech, capped at 1.0.
      // NOTE(review): segment t0/t1 appear to be in 10 ms units (see the
      // /100 conversion below) while audioDuration is in seconds, so the
      // totalTime/audioDuration ratio mixes units — confirm against the
      // VAD context's segment timestamp documentation.
      const totalTime = vadSegments.reduce((sum, segment) => sum + (segment.t1 - segment.t0), 0);
      const audioDuration = audioData.length / 16000 / 2; // Convert bytes to seconds
      confidence = totalTime > 0 ? Math.min(totalTime / audioDuration, 1.0) : 0.0;
      // End time of the last detected segment (-1 when t1 is falsy)
      lastSpeechDetectedTime = ((_vadSegments = vadSegments[vadSegments.length - 1]) === null || _vadSegments === void 0 ? void 0 : _vadSegments.t1) || -1;
    }
    const threshold = this.options.vadOptions.threshold || 0.5;
    let isSpeech = confidence > threshold;
    const currentTimestamp = Date.now();

    // Determine VAD event type based on current and previous state
    let vadEventType;
    if (isSpeech) {
      vadEventType = this.lastVadState === 'silence' ? 'speech_start' : 'speech_continue';
      const minDuration = this.options.audioMinSec;
      // Demote to silence/speech_end when the detection is stale: either the
      // last-speech timestamp did not advance, or it advanced by less than
      // audioMinSec (the /100 converts the t1 delta to seconds).
      if (lastSpeechDetectedTime === this.lastSpeechDetectedTime || (lastSpeechDetectedTime - this.lastSpeechDetectedTime) / 100 < minDuration) {
        if (this.lastVadState === 'silence') vadEventType = 'silence';
        if (this.lastVadState === 'speech') vadEventType = 'speech_end';
        isSpeech = false;
        confidence = 0.0;
      }
      this.lastSpeechDetectedTime = lastSpeechDetectedTime;
    } else {
      vadEventType = this.lastVadState === 'speech' ? 'speech_end' : 'silence';
    }

    // Update VAD state for next detection
    this.lastVadState = isSpeech ? 'speech' : 'silence';
    const {
      sampleRate = 16000
    } = this.options.audioStreamConfig || {};
    return {
      type: vadEventType,
      lastSpeechDetectedTime,
      timestamp: currentTimestamp,
      confidence,
      duration: audioData.length / sampleRate / 2,
      // Convert bytes to seconds
      sliceIndex,
      currentThreshold: threshold
    };
  } catch (error) {
    this.log(`VAD detection error: ${error}`);
    // Re-throw the error so it can be handled by the caller
    throw error;
  }
}
|
|
453
|
+
// Re-entrancy guard: prevents two overlapping drains of the queue
isProcessingTranscriptionQueue = false;

/**
 * Process the transcription queue.
 *
 * Drains the queue newest-first: takes the front item, then deliberately
 * DISCARDS all remaining entries (only the most recent audio matters for
 * realtime output). Each item is transcribed with errors routed to
 * `handleError`. Re-entrant calls return immediately.
 */
async processTranscriptionQueue() {
  if (this.isProcessingTranscriptionQueue) return;
  this.isProcessingTranscriptionQueue = true;
  while (this.transcriptionQueue.length > 0) {
    const item = this.transcriptionQueue.shift();
    this.transcriptionQueue = []; // Old items are not needed anymore
    if (item) {
      // eslint-disable-next-line no-await-in-loop
      await this.processTranscription(item).catch(error => {
        this.handleError(`Transcription error: ${error}`);
      });
    }
  }
  this.isProcessingTranscriptionQueue = false;
}
|
|
473
|
+
|
|
474
|
+
/**
|
|
475
|
+
* Build prompt from initial prompt and previous slices
|
|
476
|
+
*/
|
|
477
|
+
buildPrompt(currentSliceIndex) {
|
|
478
|
+
const promptParts = [];
|
|
479
|
+
|
|
480
|
+
// Add initial prompt if provided
|
|
481
|
+
if (this.options.initialPrompt) {
|
|
482
|
+
promptParts.push(this.options.initialPrompt);
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
// Add previous slice results if enabled
|
|
486
|
+
if (this.options.promptPreviousSlices) {
|
|
487
|
+
// Get transcription results from previous slices (up to the current slice)
|
|
488
|
+
const previousResults = Array.from(this.transcriptionResults.entries()).filter(_ref => {
|
|
489
|
+
let [sliceIndex] = _ref;
|
|
490
|
+
return sliceIndex < currentSliceIndex;
|
|
491
|
+
}).sort((_ref2, _ref3) => {
|
|
492
|
+
let [a] = _ref2;
|
|
493
|
+
let [b] = _ref3;
|
|
494
|
+
return a - b;
|
|
495
|
+
}) // Sort by slice index
|
|
496
|
+
.map(_ref4 => {
|
|
497
|
+
var _result$transcribeEve;
|
|
498
|
+
let [, result] = _ref4;
|
|
499
|
+
return (_result$transcribeEve = result.transcribeEvent.data) === null || _result$transcribeEve === void 0 ? void 0 : _result$transcribeEve.result;
|
|
500
|
+
}).filter(result => Boolean(result)); // Filter out empty results with type guard
|
|
501
|
+
|
|
502
|
+
if (previousResults.length > 0) {
|
|
503
|
+
promptParts.push(...previousResults);
|
|
504
|
+
}
|
|
505
|
+
}
|
|
506
|
+
return promptParts.join(' ') || undefined;
|
|
507
|
+
}
|
|
508
|
+
|
|
509
|
+
/**
 * Process a single transcription.
 *
 * Builds a prompt from prior slices, runs whisper over the item's audio,
 * emits a 'transcribe' event (or an 'error' event on failure) through
 * `callbacks.onTranscribe`, and records the result (without audio data)
 * in `transcriptionResults` for future prompt building. On exit, either
 * continues draining the queue or clears the `isTranscribing` flag.
 * @param {object} item - { sliceIndex, audioData } queued entry
 */
async processTranscription(item) {
  if (!this.isActive) {
    return;
  }
  this.isTranscribing = true;

  // Emit stats update for status change
  this.emitStatsUpdate('status_change');
  const startTime = Date.now();
  try {
    var _this$callbacks$onTra, _this$callbacks5;
    // Build prompt from initial prompt and previous slices
    const prompt = this.buildPrompt(item.sliceIndex);
    const audioBuffer = item.audioData.buffer;
    const {
      promise
    } = this.whisperContext.transcribeData(audioBuffer, {
      ...this.options.transcribeOptions,
      prompt,
      // Include the constructed prompt
      onProgress: undefined // Disable progress for realtime
    });

    const result = await promise;
    const endTime = Date.now();

    // Create transcribe event
    const {
      sampleRate = 16000
    } = this.options.audioStreamConfig || {};
    const transcribeEvent = {
      type: 'transcribe',
      sliceIndex: item.sliceIndex,
      data: result,
      isCapturing: this.audioStream.isRecording(),
      processTime: endTime - startTime,
      // bytes / (samples per ms) / (2 bytes per 16-bit sample) = duration in ms
      recordingTime: item.audioData.length / (sampleRate / 1000) / 2,
      // ms,
      memoryUsage: this.sliceManager.getMemoryUsage()
    };

    // Emit transcribe event (compiled optional call:
    // this.callbacks.onTranscribe?.(transcribeEvent))
    (_this$callbacks$onTra = (_this$callbacks5 = this.callbacks).onTranscribe) === null || _this$callbacks$onTra === void 0 ? void 0 : _this$callbacks$onTra.call(_this$callbacks5, transcribeEvent);

    // Save transcription results
    const slice = this.sliceManager.getSliceByIndex(item.sliceIndex);
    if (slice) {
      this.transcriptionResults.set(item.sliceIndex, {
        slice: {
          // Don't keep data in the slice
          index: slice.index,
          sampleCount: slice.sampleCount,
          startTime: slice.startTime,
          endTime: slice.endTime,
          isProcessed: slice.isProcessed,
          isReleased: slice.isReleased
        },
        transcribeEvent
      });
    }

    // Emit stats update for memory/slice changes
    this.emitStatsUpdate('memory_change');
    this.log(`Transcribed speech segment ${item.sliceIndex}: "${result.result}"`);
  } catch (error) {
    var _this$callbacks$onTra2, _this$callbacks6;
    // Emit error event to transcribe callback
    const errorEvent = {
      type: 'error',
      sliceIndex: item.sliceIndex,
      data: undefined,
      isCapturing: this.audioStream.isRecording(),
      processTime: Date.now() - startTime,
      recordingTime: 0,
      memoryUsage: this.sliceManager.getMemoryUsage()
    };
    (_this$callbacks$onTra2 = (_this$callbacks6 = this.callbacks).onTranscribe) === null || _this$callbacks$onTra2 === void 0 ? void 0 : _this$callbacks$onTra2.call(_this$callbacks6, errorEvent);
    this.handleError(`Transcription failed for speech segment ${item.sliceIndex}: ${error}`);
  } finally {
    // Check if we should continue processing queue; only clear the
    // transcribing flag once the queue is fully drained
    if (this.transcriptionQueue.length > 0) {
      await this.processTranscriptionQueue();
    } else {
      this.isTranscribing = false;
    }
  }
}
|
|
599
|
+
|
|
600
|
+
/**
|
|
601
|
+
* Handle audio status changes
|
|
602
|
+
*/
|
|
603
|
+
handleAudioStatusChange(isRecording) {
|
|
604
|
+
this.log(`Audio recording: ${isRecording ? 'started' : 'stopped'}`);
|
|
605
|
+
}
|
|
606
|
+
|
|
607
|
+
/**
|
|
608
|
+
* Handle errors from components
|
|
609
|
+
*/
|
|
610
|
+
handleError(error) {
  this.log(`Error: ${error}`);
  // Forward to the registered error callback, if any.
  const onError = this.callbacks.onError;
  if (onError != null) onError.call(this.callbacks, error);
}
|
|
615
|
+
|
|
616
|
+
/**
|
|
617
|
+
* Update callbacks
|
|
618
|
+
*/
|
|
619
|
+
updateCallbacks(callbacks) {
|
|
620
|
+
this.callbacks = {
|
|
621
|
+
...this.callbacks,
|
|
622
|
+
...callbacks
|
|
623
|
+
};
|
|
624
|
+
}
|
|
625
|
+
|
|
626
|
+
/**
|
|
627
|
+
* Update VAD options dynamically
|
|
628
|
+
*/
|
|
629
|
+
updateVadOptions(options) {
|
|
630
|
+
this.options.vadOptions = {
|
|
631
|
+
...this.options.vadOptions,
|
|
632
|
+
...options
|
|
633
|
+
};
|
|
634
|
+
}
|
|
635
|
+
|
|
636
|
+
/**
|
|
637
|
+
* Update auto-slice options dynamically
|
|
638
|
+
*/
|
|
639
|
+
updateAutoSliceOptions(options) {
|
|
640
|
+
if (options.autoSliceOnSpeechEnd !== undefined) {
|
|
641
|
+
this.options.autoSliceOnSpeechEnd = options.autoSliceOnSpeechEnd;
|
|
642
|
+
}
|
|
643
|
+
if (options.autoSliceThreshold !== undefined) {
|
|
644
|
+
this.options.autoSliceThreshold = options.autoSliceThreshold;
|
|
645
|
+
}
|
|
646
|
+
this.log(`Auto-slice options updated: enabled=${this.options.autoSliceOnSpeechEnd}, threshold=${this.options.autoSliceThreshold}`);
|
|
647
|
+
}
|
|
648
|
+
|
|
649
|
+
/**
|
|
650
|
+
* Get current statistics
|
|
651
|
+
*/
|
|
652
|
+
getStatistics() {
|
|
653
|
+
return {
|
|
654
|
+
isActive: this.isActive,
|
|
655
|
+
isTranscribing: this.isTranscribing,
|
|
656
|
+
vadEnabled: this.vadEnabled,
|
|
657
|
+
audioStats: {
|
|
658
|
+
isRecording: this.audioStream.isRecording(),
|
|
659
|
+
accumulatedSamples: this.accumulatedData.length
|
|
660
|
+
},
|
|
661
|
+
vadStats: this.vadEnabled ? {
|
|
662
|
+
enabled: true,
|
|
663
|
+
contextAvailable: !!this.vadContext,
|
|
664
|
+
lastSpeechDetectedTime: this.lastSpeechDetectedTime
|
|
665
|
+
} : null,
|
|
666
|
+
sliceStats: this.sliceManager.getCurrentSliceInfo(),
|
|
667
|
+
autoSliceConfig: {
|
|
668
|
+
enabled: this.options.autoSliceOnSpeechEnd,
|
|
669
|
+
threshold: this.options.autoSliceThreshold,
|
|
670
|
+
targetDuration: this.options.audioSliceSec,
|
|
671
|
+
minDuration: this.options.audioMinSec
|
|
672
|
+
}
|
|
673
|
+
};
|
|
674
|
+
}
|
|
675
|
+
|
|
676
|
+
/**
|
|
677
|
+
* Get all transcription results
|
|
678
|
+
*/
|
|
679
|
+
getTranscriptionResults() {
|
|
680
|
+
return Array.from(this.transcriptionResults.values());
|
|
681
|
+
}
|
|
682
|
+
|
|
683
|
+
/**
|
|
684
|
+
* Force move to the next slice, finalizing the current one regardless of capacity
|
|
685
|
+
*/
|
|
686
|
+
async nextSlice() {
|
|
687
|
+
var _this$callbacks$onTra3, _this$callbacks8;
|
|
688
|
+
if (!this.isActive) {
|
|
689
|
+
this.log('Cannot force next slice - transcriber is not active');
|
|
690
|
+
return;
|
|
691
|
+
}
|
|
692
|
+
|
|
693
|
+
// Emit start event to indicate slice processing has started
|
|
694
|
+
const startEvent = {
|
|
695
|
+
type: 'start',
|
|
696
|
+
sliceIndex: -1,
|
|
697
|
+
// Use -1 to indicate forced slice
|
|
698
|
+
data: undefined,
|
|
699
|
+
isCapturing: this.audioStream.isRecording(),
|
|
700
|
+
processTime: 0,
|
|
701
|
+
recordingTime: 0,
|
|
702
|
+
memoryUsage: this.sliceManager.getMemoryUsage()
|
|
703
|
+
};
|
|
704
|
+
(_this$callbacks$onTra3 = (_this$callbacks8 = this.callbacks).onTranscribe) === null || _this$callbacks$onTra3 === void 0 ? void 0 : _this$callbacks$onTra3.call(_this$callbacks8, startEvent);
|
|
705
|
+
|
|
706
|
+
// Check if there are pending transcriptions or currently transcribing
|
|
707
|
+
if (this.isTranscribing || this.transcriptionQueue.length > 0) {
|
|
708
|
+
this.log('Waiting for pending transcriptions to complete before forcing next slice...');
|
|
709
|
+
|
|
710
|
+
// Wait for current transcription queue to be processed
|
|
711
|
+
await this.processTranscriptionQueue();
|
|
712
|
+
}
|
|
713
|
+
const result = this.sliceManager.forceNextSlice();
|
|
714
|
+
if (result.slice) {
|
|
715
|
+
this.log(`Forced slice ${result.slice.index} ready (${result.slice.data.length} bytes)`);
|
|
716
|
+
|
|
717
|
+
// Process VAD for the slice if enabled
|
|
718
|
+
if (!this.isTranscribing && this.vadEnabled) {
|
|
719
|
+
this.processSliceVAD(result.slice).catch(error => {
|
|
720
|
+
this.handleError(`VAD processing error: ${error}`);
|
|
721
|
+
});
|
|
722
|
+
} else if (!this.isTranscribing) {
|
|
723
|
+
// If VAD is disabled, transcribe slices as they become ready
|
|
724
|
+
this.queueSliceForTranscription(result.slice).catch(error => {
|
|
725
|
+
this.handleError(`Failed to queue slice for transcription: ${error}`);
|
|
726
|
+
});
|
|
727
|
+
} else {
|
|
728
|
+
this.log(`Skipping slice ${result.slice.index} - already transcribing`);
|
|
729
|
+
}
|
|
730
|
+
this.emitStatsUpdate('memory_change');
|
|
731
|
+
} else {
|
|
732
|
+
this.log('Forced next slice but no slice data to process');
|
|
733
|
+
}
|
|
734
|
+
}
|
|
735
|
+
|
|
736
|
+
/**
|
|
737
|
+
* Reset all components
|
|
738
|
+
*/
|
|
739
|
+
reset() {
|
|
740
|
+
this.sliceManager.reset();
|
|
741
|
+
this.transcriptionQueue = [];
|
|
742
|
+
this.isTranscribing = false;
|
|
743
|
+
this.accumulatedData = new Uint8Array(0);
|
|
744
|
+
|
|
745
|
+
// Reset simplified VAD state
|
|
746
|
+
this.lastSpeechDetectedTime = -1;
|
|
747
|
+
this.lastVadState = 'silence';
|
|
748
|
+
|
|
749
|
+
// Reset stats snapshot for clean start
|
|
750
|
+
this.lastStatsSnapshot = null;
|
|
751
|
+
|
|
752
|
+
// Cancel WAV file writing if in progress
|
|
753
|
+
if (this.wavFileWriter) {
|
|
754
|
+
this.wavFileWriter.cancel().catch(error => {
|
|
755
|
+
this.log(`Failed to cancel WAV file writing: ${error}`);
|
|
756
|
+
});
|
|
757
|
+
this.wavFileWriter = null;
|
|
758
|
+
}
|
|
759
|
+
|
|
760
|
+
// Clear transcription results
|
|
761
|
+
this.transcriptionResults.clear();
|
|
762
|
+
}
|
|
763
|
+
|
|
764
|
+
/**
|
|
765
|
+
* Release all resources
|
|
766
|
+
*/
|
|
767
|
+
async release() {
|
|
768
|
+
var _this$wavFileWriter;
|
|
769
|
+
if (this.isActive) {
|
|
770
|
+
await this.stop();
|
|
771
|
+
}
|
|
772
|
+
await this.audioStream.release();
|
|
773
|
+
await ((_this$wavFileWriter = this.wavFileWriter) === null || _this$wavFileWriter === void 0 ? void 0 : _this$wavFileWriter.finalize());
|
|
774
|
+
this.vadContext = undefined;
|
|
775
|
+
}
|
|
776
|
+
|
|
777
|
+
/**
|
|
778
|
+
* Emit stats update event if stats have changed significantly
|
|
779
|
+
*/
|
|
780
|
+
emitStatsUpdate(eventType) {
|
|
781
|
+
const currentStats = this.getStatistics();
|
|
782
|
+
|
|
783
|
+
// Check if stats have changed significantly
|
|
784
|
+
if (!this.lastStatsSnapshot || RealtimeTranscriber.shouldEmitStatsUpdate(currentStats, this.lastStatsSnapshot)) {
|
|
785
|
+
var _this$callbacks$onSta4, _this$callbacks9;
|
|
786
|
+
const statsEvent = {
|
|
787
|
+
timestamp: Date.now(),
|
|
788
|
+
type: eventType,
|
|
789
|
+
data: currentStats
|
|
790
|
+
};
|
|
791
|
+
(_this$callbacks$onSta4 = (_this$callbacks9 = this.callbacks).onStatsUpdate) === null || _this$callbacks$onSta4 === void 0 ? void 0 : _this$callbacks$onSta4.call(_this$callbacks9, statsEvent);
|
|
792
|
+
this.lastStatsSnapshot = {
|
|
793
|
+
...currentStats
|
|
794
|
+
};
|
|
795
|
+
}
|
|
796
|
+
}
|
|
797
|
+
|
|
798
|
+
/**
|
|
799
|
+
* Determine if stats update should be emitted
|
|
800
|
+
*/
|
|
801
|
+
static shouldEmitStatsUpdate(current, previous) {
|
|
802
|
+
var _current$sliceStats, _current$sliceStats$m, _previous$sliceStats, _previous$sliceStats$;
|
|
803
|
+
// Always emit on status changes
|
|
804
|
+
if (current.isActive !== previous.isActive || current.isTranscribing !== previous.isTranscribing) {
|
|
805
|
+
return true;
|
|
806
|
+
}
|
|
807
|
+
|
|
808
|
+
// Emit on significant memory changes (>10% or >5MB)
|
|
809
|
+
const currentMemory = ((_current$sliceStats = current.sliceStats) === null || _current$sliceStats === void 0 ? void 0 : (_current$sliceStats$m = _current$sliceStats.memoryUsage) === null || _current$sliceStats$m === void 0 ? void 0 : _current$sliceStats$m.estimatedMB) || 0;
|
|
810
|
+
const previousMemory = ((_previous$sliceStats = previous.sliceStats) === null || _previous$sliceStats === void 0 ? void 0 : (_previous$sliceStats$ = _previous$sliceStats.memoryUsage) === null || _previous$sliceStats$ === void 0 ? void 0 : _previous$sliceStats$.estimatedMB) || 0;
|
|
811
|
+
const memoryDiff = Math.abs(currentMemory - previousMemory);
|
|
812
|
+
if (memoryDiff > 5 || previousMemory > 0 && memoryDiff / previousMemory > 0.1) {
|
|
813
|
+
return true;
|
|
814
|
+
}
|
|
815
|
+
return false;
|
|
816
|
+
}
|
|
817
|
+
|
|
818
|
+
/**
|
|
819
|
+
* Logger function
|
|
820
|
+
*/
|
|
821
|
+
log(message) {
|
|
822
|
+
this.options.logger(`[RealtimeTranscriber] ${message}`);
|
|
823
|
+
}
|
|
824
|
+
}
|
|
825
|
+
//# sourceMappingURL=RealtimeTranscriber.js.map
|