webtalk 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.gitattributes +35 -0
- package/.github/workflows/publish.yml +26 -0
- package/README.md +1 -0
- package/app.html +519 -0
- package/assets/index-ClpvH5Vn.js +40 -0
- package/assets/index-DUYekU7u.css +1 -0
- package/assets/worker-BPxxCWVT.js +2679 -0
- package/config.js +36 -0
- package/debug.js +21 -0
- package/download-lock.js +26 -0
- package/hot-reload.js +78 -0
- package/middleware.js +62 -0
- package/package.json +33 -0
- package/persistent-state.js +62 -0
- package/sdk.js +22 -0
- package/serve-static.js +45 -0
- package/server.js +177 -0
- package/setup-npm-publishing.sh +140 -0
- package/stt.js +141 -0
- package/test.mp3 +0 -0
- package/tts/EventEmitter.js +59 -0
- package/tts/PCMPlayerWorklet.js +563 -0
- package/tts/inference-worker.js +1121 -0
- package/tts/onnx-streaming.js +721 -0
- package/tts-models.js +97 -0
- package/tts-utils.js +52 -0
- package/tts.js +167 -0
- package/whisper-models.js +161 -0
- package/worker-patch.js +32 -0
package/tts/onnx-streaming.js
@@ -0,0 +1,721 @@
+
+import { PCMPlayerWorklet as PCMPlayer } from './PCMPlayerWorklet.js';
+
+// Configuration
+const SAMPLE_RATE = 24000;
+const FADE_SAMPLES = 480; // 20ms fade at 24kHz
+const REALTIME_THRESHOLD = 1.0;
+
+export class PocketTTSStreaming {
+  constructor() {
+    this.worker = null;
+    this.player = null;
+    this.audioContext = null;
+    this.isGenerating = false;
+    this.isWorkerReady = false;
+    this.pendingGeneration = false;
+
+    // Voice state
+    this.availableVoices = [];
+    this.currentVoice = null;
+    this.customVoiceAudio = null;
+
+    // Metrics State
+    this.generationStartTime = 0;
+    this.lastChunkFinishTime = 0;
+    this.rtfMovingAverage = 0;
+    this.skipNextRtf = false;
+
+    // Edge optimization state (dynamic LSD)
+    this.edgeOptimizationApplied = false;
+    this.playbackMode = 'pending'; // pending | stream | buffer_all
+    this.bufferedChunks = [];
+    this.deferStreamEnd = false;
+
+    this.elements = {
+      textInput: document.getElementById('text-input'),
+      generateBtn: document.getElementById('generate-btn'),
+      stopBtn: document.getElementById('stop-btn'),
+      statusText: document.getElementById('stat-status'),
+      statusIndicator: document.getElementById('status-indicator'),
+      modelStatusIcon: document.querySelector('#model-status .model-status__dot'),
+      modelStatusText: document.querySelector('#model-status .model-status__text'),
+      btnLoader: document.getElementById('btn-loader'),
+      statTTFB: document.getElementById('stat-ttfb'),
+      statRTFx: document.getElementById('stat-rtfx'),
+      ttfbBar: document.getElementById('ttfb-bar'),
+      rtfxContext: document.getElementById('rtfx-context'),
+      edgeOptNote: document.getElementById('edge-opt-note'),
+      fullGenNote: document.getElementById('full-gen-note'),
+      voiceSelect: document.getElementById('voice-select'),
+      voiceUpload: document.getElementById('voice-upload'),
+      voiceUploadBtn: document.getElementById('voice-upload-btn'),
+      voiceUploadStatus: document.getElementById('voice-upload-status')
+    };
+
+    this.attachEventListeners();
+    this.init();
+    this.setupVisualization();
+  }
+
+  async init() {
+    console.log('Pocket TTS v1.0 - Web Demo');
+    console.log('Secure context:', window.isSecureContext);
+    console.log('Location:', window.location.href);
+    this.updateStatus('Initializing...', 'running');
+
+    // Initial button state
+    this.elements.generateBtn.disabled = true;
+    if (this.elements.voiceUploadBtn) this.elements.voiceUploadBtn.disabled = true;
+    const btnText = this.elements.generateBtn.querySelector('.btn__text');
+    if (btnText) btnText.textContent = 'Loading Models...';
+    this.elements.btnLoader.style.display = 'block';
+
+    // Check secure context
+    if (!window.isSecureContext) {
+      const msg = 'AudioWorklet requires HTTPS or localhost. Current: ' + window.location.hostname;
+      console.error(msg);
+      this.updateStatus(msg, 'error');
+      this.elements.btnLoader.style.display = 'none';
+      if (btnText) btnText.textContent = 'Secure Context Required';
+      return;
+    }
+
+    try {
+      // Initialize Audio Context and Player
+      this.audioContext = new (window.AudioContext || window.webkitAudioContext)({
+        sampleRate: SAMPLE_RATE,
+        latencyHint: 'interactive'
+      });
+
+      // Check if AudioWorklet is supported
+      if (!this.audioContext.audioWorklet) {
+        throw new Error('AudioWorklet not supported in this browser.');
+      }
+
+      await this.audioContext.audioWorklet.addModule('/tts/PCMPlayerWorklet.js');
+      this.player = new PCMPlayer(this.audioContext);
+      this.player.addEventListener('audioEnded', () => {
+        if (this.deferStreamEnd) {
+          this.deferStreamEnd = false;
+          this.finalizePlayback();
+        }
+      });
+    } catch (err) {
+      console.error('Audio initialization failed:', err);
+      this.updateStatus('Audio init failed: ' + err.message, 'error');
+      this.elements.btnLoader.style.display = 'none';
+      if (btnText) btnText.textContent = 'Audio Error';
+      return;
+    }
+
+    // Initialize Worker (as ES module)
+    console.log('Spawning Inference Worker...');
+    this.worker = new Worker('./inference-worker.js?v=14', { type: 'module' });
+
+    this.worker.onmessage = (e) => {
+      const { type, data, error, status, state, metrics, text, voices, defaultVoice, voiceName } = e.data;
+
+      switch (type) {
+        case 'status':
+          this.updateStatus(status, state);
+          break;
+        case 'model_status':
+          this.updateModelStatus(status, text);
+          break;
+        case 'voices_loaded':
+          this.handleVoicesLoaded(voices, defaultVoice);
+          break;
+        case 'voice_encoded':
+          this.handleVoiceEncoded(voiceName);
+          break;
+        case 'voice_set':
+          this.currentVoice = voiceName;
+          break;
+        case 'loaded':
+          console.log('Worker confirmed models loaded.');
+          this.isWorkerReady = true;
+          this.elements.generateBtn.disabled = false;
+          if (this.elements.voiceUploadBtn) this.elements.voiceUploadBtn.disabled = false;
+          this.elements.btnLoader.style.display = 'none';
+          const loadedBtnText = this.elements.generateBtn.querySelector('.btn__text');
+          if (loadedBtnText) loadedBtnText.textContent = 'Generate Audio';
+
+          if (this.pendingGeneration) {
+            this.pendingGeneration = false;
+            this.startGeneration();
+          }
+          break;
+        case 'generation_started':
+          // The main thread already sets this in startGeneration for better precision
+          break;
+        case 'audio_chunk':
+          this.handleAudioChunk(data, metrics);
+          break;
+        case 'stream_ended':
+          this.handleStreamEnd();
+          break;
+        case 'error':
+          console.error('Worker Error:', error);
+          this.updateStatus(`Error: ${error}`, 'error');
+          this.resetUI();
+          break;
+      }
+    };
+
+    // Trigger Model Load in Worker
+    this.worker.postMessage({ type: 'load' });
+  }
+
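The init() method above defines the whole message contract between the page and the inference worker. The actual worker lives in package/tts/inference-worker.js (its +1121-line hunk is not reproduced here); the skeleton below is only an illustrative sketch of that contract as inferred from the handlers above, with placeholder bodies and placeholder voice names.

// Hypothetical worker-side skeleton matching the protocol used by the page (sketch only).
// Incoming: load, generate, stop, set_voice, encode_voice, set_lsd.
// Outgoing: status, model_status, voices_loaded, voice_encoded, voice_set,
//           loaded, generation_started, audio_chunk, stream_ended, error.
self.onmessage = async (e) => {
  const { type, data } = e.data;
  try {
    switch (type) {
      case 'load':
        self.postMessage({ type: 'status', status: 'Loading models...', state: 'running' });
        // ...model loading happens here in the real inference-worker.js...
        self.postMessage({ type: 'voices_loaded', voices: ['default'], defaultVoice: 'default' }); // placeholder names
        self.postMessage({ type: 'loaded' });
        break;
      case 'generate':
        // data: { text, voice }
        self.postMessage({ type: 'generation_started' });
        // For each synthesized chunk the page expects a Float32Array at 24 kHz plus metadata:
        // self.postMessage({ type: 'audio_chunk', data: float32Chunk,
        //   metrics: { isFirst, isLast, isSilence, chunkStart, chunkDuration, genTimeSec } });
        self.postMessage({ type: 'stream_ended' });
        break;
      case 'set_voice':
        self.postMessage({ type: 'voice_set', voiceName: data.voiceName });
        break;
      case 'encode_voice':
        // data.audio is mono Float32Array audio, at most 10 s at 24 kHz (see handleVoiceUpload).
        self.postMessage({ type: 'voice_encoded', voiceName: 'custom' });
        break;
      case 'set_lsd': // data.lsd — see the edge optimization in handleAudioChunk()
      case 'stop':
        break;
    }
  } catch (err) {
    self.postMessage({ type: 'error', error: err.message });
  }
};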
+  handleVoicesLoaded(voices, defaultVoice) {
+    this.availableVoices = voices;
+    this.currentVoice = defaultVoice;
+
+    // Populate voice selector
+    if (this.elements.voiceSelect) {
+      this.elements.voiceSelect.innerHTML = '';
+
+      // Add predefined voices
+      for (const voice of voices) {
+        const option = document.createElement('option');
+        option.value = voice;
+        option.textContent = voice.charAt(0).toUpperCase() + voice.slice(1);
+        if (voice === defaultVoice) {
+          option.selected = true;
+        }
+        this.elements.voiceSelect.appendChild(option);
+      }
+
+      // Add custom voice option
+      const customOption = document.createElement('option');
+      customOption.value = 'custom';
+      customOption.textContent = 'Custom (Upload)';
+      this.elements.voiceSelect.appendChild(customOption);
+    }
+
+    console.log('Available voices:', voices, 'Default:', defaultVoice);
+  }
+
+  handleVoiceEncoded(voiceName) {
+    this.currentVoice = voiceName;
+    if (this.elements.voiceUploadStatus) {
+      this.elements.voiceUploadStatus.textContent = 'Voice encoded successfully!';
+      this.elements.voiceUploadStatus.className = 'voice-upload-status success';
+    }
+    // Set the select to custom
+    if (this.elements.voiceSelect) {
+      this.elements.voiceSelect.value = 'custom';
+    }
+  }
+
+  async handleVoiceUpload(file) {
+    if (!file) return;
+
+    if (this.elements.voiceUploadStatus) {
+      this.elements.voiceUploadStatus.textContent = 'Processing audio...';
+      this.elements.voiceUploadStatus.className = 'voice-upload-status';
+    }
+
+    try {
+      // Decode audio file
+      const arrayBuffer = await file.arrayBuffer();
+      const audioBuffer = await this.audioContext.decodeAudioData(arrayBuffer);
+
+      // Resample to 24kHz if needed
+      let audioData;
+      if (audioBuffer.sampleRate !== SAMPLE_RATE) {
+        audioData = this.resampleAudio(audioBuffer, SAMPLE_RATE);
+      } else {
+        audioData = audioBuffer.getChannelData(0);
+      }
+
+      // Convert to mono if stereo
+      if (audioBuffer.numberOfChannels > 1 && audioBuffer.sampleRate === SAMPLE_RATE) {
+        const left = audioBuffer.getChannelData(0);
+        const right = audioBuffer.getChannelData(1);
+        audioData = new Float32Array(left.length);
+        for (let i = 0; i < left.length; i++) {
+          audioData[i] = (left[i] + right[i]) / 2;
+        }
+      }
+
+      // Limit to 10 seconds max
+      const maxSamples = SAMPLE_RATE * 10;
+      if (audioData.length > maxSamples) {
+        audioData = audioData.slice(0, maxSamples);
+      }
+
+      // Send to worker for encoding
+      this.worker.postMessage({
+        type: 'encode_voice',
+        data: { audio: audioData }
+      });
+
+    } catch (err) {
+      console.error('Voice upload error:', err);
+      if (this.elements.voiceUploadStatus) {
+        this.elements.voiceUploadStatus.textContent = `Error: ${err.message}`;
+        this.elements.voiceUploadStatus.className = 'voice-upload-status error';
+      }
+    }
+  }
+
+  resampleAudio(audioBuffer, targetRate) {
+    const sourceRate = audioBuffer.sampleRate;
+    const sourceData = audioBuffer.getChannelData(0);
+
+    // If stereo, mix to mono
+    let monoData = sourceData;
+    if (audioBuffer.numberOfChannels > 1) {
+      const right = audioBuffer.getChannelData(1);
+      monoData = new Float32Array(sourceData.length);
+      for (let i = 0; i < sourceData.length; i++) {
+        monoData[i] = (sourceData[i] + right[i]) / 2;
+      }
+    }
+
+    // Linear interpolation resampling
+    const ratio = sourceRate / targetRate;
+    const outputLength = Math.floor(monoData.length / ratio);
+    const output = new Float32Array(outputLength);
+
+    for (let i = 0; i < outputLength; i++) {
+      const srcIndex = i * ratio;
+      const srcIndexFloor = Math.floor(srcIndex);
+      const srcIndexCeil = Math.min(srcIndexFloor + 1, monoData.length - 1);
+      const t = srcIndex - srcIndexFloor;
+      output[i] = monoData[srcIndexFloor] * (1 - t) + monoData[srcIndexCeil] * t;
+    }
+
+    return output;
+  }
+
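resampleAudio() above is plain linear interpolation: output sample i is read at fractional source position i * (sourceRate / targetRate) and blended from its two nearest neighbours. A minimal standalone sketch of the same idea on a raw Float32Array (the function name and the 48 kHz input are illustrative only, not part of the package):

// Linear-interpolation resampler for a mono Float32Array (sketch).
function resampleLinear(samples, sourceRate, targetRate) {
  const ratio = sourceRate / targetRate;
  const out = new Float32Array(Math.floor(samples.length / ratio));
  for (let i = 0; i < out.length; i++) {
    const pos = i * ratio;                       // fractional read position in the source
    const i0 = Math.floor(pos);
    const i1 = Math.min(i0 + 1, samples.length - 1);
    const t = pos - i0;                          // blend factor between the two neighbours
    out[i] = samples[i0] * (1 - t) + samples[i1] * t;
  }
  return out;
}

// e.g. one second of 48 kHz audio becomes 24000 samples at the demo's 24 kHz rate:
// resampleLinear(new Float32Array(48000), 48000, 24000).length === 24000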
+  attachEventListeners() {
+    this.elements.generateBtn.addEventListener('click', () => this.startGeneration());
+    this.elements.stopBtn.addEventListener('click', () => this.stopGeneration());
+
+    // Voice selector
+    if (this.elements.voiceSelect) {
+      this.elements.voiceSelect.addEventListener('change', (e) => {
+        const voice = e.target.value;
+        if (voice === 'custom') {
+          // Trigger file upload
+          if (this.elements.voiceUpload) {
+            this.elements.voiceUpload.click();
+          }
+        } else {
+          this.worker.postMessage({
+            type: 'set_voice',
+            data: { voiceName: voice }
+          });
+        }
+      });
+    }
+
+    // Voice file upload
+    if (this.elements.voiceUpload) {
+      this.elements.voiceUpload.addEventListener('change', (e) => {
+        const file = e.target.files[0];
+        if (file) {
+          this.handleVoiceUpload(file);
+        }
+      });
+    }
+
+    // Voice upload button
+    if (this.elements.voiceUploadBtn) {
+      this.elements.voiceUploadBtn.addEventListener('click', () => {
+        if (this.elements.voiceUpload) {
+          this.elements.voiceUpload.click();
+        }
+      });
+    }
+
+    // Sample buttons
+    document.querySelectorAll('.sample-btn').forEach(btn => {
+      btn.addEventListener('click', () => {
+        this.elements.textInput.value = btn.getAttribute('data-text');
+        // Trigger input event to update character count
+        this.elements.textInput.dispatchEvent(new Event('input'));
+      });
+    });
+
+    // Character count
+    this.elements.textInput.addEventListener('input', () => {
+      const count = this.elements.textInput.value.length;
+      const countEl = document.getElementById('char-count');
+      if (countEl) countEl.textContent = count;
+    });
+
+    this.elements.textInput.addEventListener('keydown', (e) => {
+      if ((e.ctrlKey || e.metaKey) && e.key === 'Enter') {
+        this.startGeneration();
+      }
+    });
+  }
+
+  async startGeneration() {
+    this.generationStartTime = performance.now();
+    try {
+      if (!this.isWorkerReady) {
+        this.pendingGeneration = true;
+        const btnText = this.elements.generateBtn.querySelector('.btn__text');
+        if (btnText) btnText.textContent = 'Starting soon...';
+        return;
+      }
+
+      if (this.isGenerating) return;
+
+      if (this.audioContext && this.audioContext.state === 'suspended') {
+        await this.audioContext.resume();
+      }
+
+      const text = this.elements.textInput.value.trim();
+      if (!text) return;
+
+      this.isGenerating = true;
+      this.elements.generateBtn.disabled = true;
+      this.elements.generateBtn.classList.add('btn--generating');
+      this.elements.stopBtn.disabled = false;
+
+      if (this.player) this.player.reset();
+
+      // Reset metrics
+      this.elements.statTTFB.textContent = '--';
+      this.elements.statRTFx.textContent = '--';
+      if (this.elements.ttfbBar) this.elements.ttfbBar.style.width = '0%';
+      if (this.elements.edgeOptNote) this.elements.edgeOptNote.style.display = 'none';
+      if (this.elements.fullGenNote) this.elements.fullGenNote.style.display = 'none';
+
+      this.rtfMovingAverage = 0;
+      this.edgeOptimizationApplied = false;
+      this.lastChunkFinishTime = 0;
+      this.skipNextRtf = false;
+      this.playbackMode = 'pending';
+      this.bufferedChunks = [];
+      this.deferStreamEnd = false;
+
+      // Get current voice from selector
+      const voice = this.elements.voiceSelect ? this.elements.voiceSelect.value : this.currentVoice;
+
+      this.worker.postMessage({
+        type: 'generate',
+        data: { text, voice }
+      });
+    } catch (err) {
+      console.error('Error in startGeneration:', err);
+      this.updateStatus(`Error: ${err.message}`, 'error');
+      this.isGenerating = false;
+      this.resetUI();
+    }
+  }
+
+  stopGeneration() {
+    if (!this.isGenerating) return;
+    this.worker.postMessage({ type: 'stop' });
+    // Handle stop immediately in UI
+    this.handleStreamEnd();
+  }
+
+  applyFadeIn(audioData) {
+    const fadeLen = Math.min(FADE_SAMPLES, audioData.length);
+    for (let i = 0; i < fadeLen; i++) {
+      audioData[i] *= i / fadeLen;
+    }
+  }
+
+  applyFadeOut(audioData) {
+    const fadeLen = Math.min(FADE_SAMPLES, audioData.length);
+    const startIdx = audioData.length - fadeLen;
+    for (let i = 0; i < fadeLen; i++) {
+      audioData[startIdx + i] *= 1 - (i / fadeLen);
+    }
+  }
+
+  bufferOrPlay(audioData) {
+    if (this.playbackMode === 'stream') {
+      this.player.playAudio(audioData);
+    } else {
+      this.bufferedChunks.push(audioData);
+    }
+  }
+
+  flushBufferedAudio() {
+    if (!this.bufferedChunks.length) return;
+    for (const chunk of this.bufferedChunks) {
+      this.player.playAudio(chunk);
+    }
+    this.bufferedChunks = [];
+  }
+
+  switchToStream() {
+    this.playbackMode = 'stream';
+    if (this.elements.fullGenNote) this.elements.fullGenNote.style.display = 'none';
+    this.flushBufferedAudio();
+  }
+
+  switchToBufferAll() {
+    this.playbackMode = 'buffer_all';
+    if (this.elements.fullGenNote) this.elements.fullGenNote.style.display = 'block';
+  }
+
+  finalizePlayback() {
+    this.resetUI();
+    this.isGenerating = false;
+    if (this.elements.fullGenNote) this.elements.fullGenNote.style.display = 'none';
+  }
+
+  handleAudioChunk(audioData, metrics) {
+    if (!this.isGenerating) return;
+
+    if (metrics.isSilence) {
+      this.bufferOrPlay(audioData);
+      this.skipNextRtf = true;
+      return;
+    }
+
+    // Apply fade-in at the start of each text chunk
+    if (metrics.isFirst || metrics.chunkStart) this.applyFadeIn(audioData);
+    if (metrics.isLast) this.applyFadeOut(audioData);
+
+    // Calculate RTFx immediately (not in RAF) so edge optimization triggers fast
+    const now = performance.now();
+    let ttfb = 0;
+    let instantaneousRTF = 0;
+    let arrivalRTF = 0;
+
+    if (metrics.isFirst) {
+      ttfb = now - this.generationStartTime;
+      this.lastChunkFinishTime = now;
+    } else if (this.skipNextRtf) {
+      this.lastChunkFinishTime = now;
+      this.skipNextRtf = false;
+    } else if (this.lastChunkFinishTime > 0) {
+      const timeSinceLastChunk = (now - this.lastChunkFinishTime) / 1000;
+      this.lastChunkFinishTime = now;
+
+      if (timeSinceLastChunk > 0) {
+        arrivalRTF = metrics.chunkDuration / timeSinceLastChunk;
+      }
+    }
+
+    // Prefer actual generation time when available (avoids TTFB skew)
+    if (metrics.genTimeSec && metrics.genTimeSec > 0) {
+      instantaneousRTF = metrics.chunkDuration / metrics.genTimeSec;
+    } else if (arrivalRTF > 0) {
+      instantaneousRTF = arrivalRTF;
+    }
+
+    if (instantaneousRTF > 0) {
+      if (this.rtfMovingAverage === 0) {
+        this.rtfMovingAverage = instantaneousRTF;
+      } else {
+        this.rtfMovingAverage = this.rtfMovingAverage * 0.8 + instantaneousRTF * 0.2;
+      }
+
+      const edgeRtf = arrivalRTF > 0 ? arrivalRTF : instantaneousRTF;
+      if (!this.edgeOptimizationApplied && edgeRtf < REALTIME_THRESHOLD) {
+        this.edgeOptimizationApplied = true;
+        this.worker.postMessage({ type: 'set_lsd', data: { lsd: 1 } });
+        console.log('Edge optimization applied: LSD reduced to 1');
+      }
+    }
+
+    if (this.playbackMode === 'pending') {
+      if (instantaneousRTF >= REALTIME_THRESHOLD) {
+        this.switchToStream();
+      } else if (!metrics.isFirst && this.edgeOptimizationApplied && instantaneousRTF < REALTIME_THRESHOLD) {
+        this.switchToBufferAll();
+      }
+    }
+
+    this.bufferOrPlay(audioData);
+
+    // Update UI in RAF (non-blocking)
+    const rtfxToDisplay = this.rtfMovingAverage;
+    const showEdgeOpt = this.edgeOptimizationApplied;
+    requestAnimationFrame(() => {
+      if (metrics.isFirst) {
+        this.updateTTFB(ttfb);
+      }
+      if (rtfxToDisplay > 0) {
+        this.updateRTFx(rtfxToDisplay);
+      }
+      if (showEdgeOpt && this.elements.edgeOptNote) {
+        this.elements.edgeOptNote.style.display = 'block';
+      }
+    });
+  }
+
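handleAudioChunk() drives everything off the real-time factor RTFx: seconds of audio produced per second of generation time, so values at or above 1.0 mean synthesis keeps up with playback. The displayed value is an exponential moving average with 20% weight on the newest chunk, and a dip below REALTIME_THRESHOLD triggers the one-shot set_lsd message plus the stream / buffer_all decision. A condensed sketch of just the metric bookkeeping (function and variable names here are illustrative, not part of the package):

// Condensed view of the RTFx bookkeeping above (sketch only).
const REALTIME = 1.0;

function updateRtf(state, chunkDuration, genTimeSec) {
  const instantaneous = chunkDuration / genTimeSec; // > 1 means faster than real time
  state.average = state.average === 0
    ? instantaneous                                  // first measurement seeds the average
    : state.average * 0.8 + instantaneous * 0.2;     // EMA, 20% weight on the newest chunk
  return instantaneous;
}

const state = { average: 0 };
updateRtf(state, 0.5, 0.4); // 1.25x: 0.5 s of audio in 0.4 s, fast enough to keep streaming
updateRtf(state, 0.5, 0.8); // 0.625x: below REALTIME, the real handler would post set_lsd
console.log(state.average); // ≈ 1.125 (= 1.25 * 0.8 + 0.625 * 0.2)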
+  handleStreamEnd() {
+    if (this.playbackMode === 'pending') {
+      this.switchToBufferAll();
+    }
+
+    if (this.playbackMode === 'buffer_all') {
+      this.flushBufferedAudio();
+      if (this.player.notifyStreamEnded) this.player.notifyStreamEnded();
+      this.deferStreamEnd = true;
+      return;
+    }
+
+    if (this.player.notifyStreamEnded) this.player.notifyStreamEnded();
+    this.finalizePlayback();
+  }
+
+  resetUI() {
+    this.elements.generateBtn.disabled = false;
+    this.elements.generateBtn.classList.remove('btn--generating');
+    const btnText = this.elements.generateBtn.querySelector('.btn__text');
+    if (btnText) btnText.textContent = 'Generate Audio';
+    this.elements.stopBtn.disabled = true;
+  }
+
+  updateStatus(text, state) {
+    this.elements.statusText.textContent = text;
+    this.elements.statusIndicator.className = `status-indicator status-${state}`;
+  }
+
+  updateModelStatus(state, text) {
+    this.elements.modelStatusText.textContent = text;
+    this.elements.modelStatusIcon.className = `status-icon status-${state}`;
+  }
+
+  updateTTFB(ms) {
+    this.elements.statTTFB.textContent = Math.round(ms);
+    const percentage = Math.min((ms / 2000) * 100, 100);
+    this.elements.ttfbBar.style.width = `${percentage}%`;
+    this.elements.ttfbBar.style.background = ms < 500 ? '#00d4aa' : ms < 1000 ? '#ffd93d' : '#ff6b6b';
+  }
+
+  updateRTFx(val) {
+    this.elements.statRTFx.textContent = `${val.toFixed(2)}x`;
+    this.elements.rtfxContext.style.color = val >= 1.0 ? '#00d4aa' : '#ff6b6b';
+  }
+
+  // -------------------------------------------------------------------------
+  // Visualization
+  // -------------------------------------------------------------------------
+  setupVisualization() {
+    this.waveformCanvas = document.getElementById('visualizer-waveform');
+    this.barsCanvas = document.getElementById('visualizer-bars');
+    if (!this.waveformCanvas || !this.barsCanvas) return;
+
+    this.waveformCtx = this.waveformCanvas.getContext('2d');
+    this.barsCtx = this.barsCanvas.getContext('2d');
+
+    // Initial resize
+    this.resizeCanvases();
+    window.addEventListener('resize', () => this.resizeCanvases());
+
+    // Start animation loop
+    requestAnimationFrame(() => this.draw());
+  }
+
+  resizeCanvases() {
+    if (!this.waveformCanvas || !this.barsCanvas) return;
+
+    const parent = this.waveformCanvas.parentElement;
+    const width = parent.clientWidth;
+    const height = parent.clientHeight;
+
+    const dpr = window.devicePixelRatio || 1;
+
+    [this.waveformCanvas, this.barsCanvas].forEach(canvas => {
+      canvas.width = width * dpr;
+      canvas.height = height * dpr;
+      canvas.style.width = `${width}px`;
+      canvas.style.height = `${height}px`;
+      const ctx = canvas.getContext('2d');
+      ctx.scale(dpr, dpr);
+    });
+  }
+
+  draw() {
+    requestAnimationFrame(() => this.draw());
+
+    if (!this.player || !this.player.analyser) return;
+
+    const bufferLength = this.player.analyser.frequencyBinCount;
+    const dataArray = new Uint8Array(bufferLength);
+
+    // Draw Bars (Frequency)
+    this.player.analyser.getByteFrequencyData(dataArray);
+    this.drawBars(dataArray);
+
+    // Draw Waveform (Time Domain)
+    this.player.analyser.getByteTimeDomainData(dataArray);
+    this.drawWaveform(dataArray);
+  }
+
+  drawWaveform(dataArray) {
+    const ctx = this.waveformCtx;
+    const canvas = this.waveformCanvas;
+    const width = canvas.width / (window.devicePixelRatio || 1);
+    const height = canvas.height / (window.devicePixelRatio || 1);
+
+    ctx.clearRect(0, 0, width, height);
+    ctx.lineWidth = 2;
+    ctx.strokeStyle = '#00d4aa'; // Mint primary
+    ctx.beginPath();
+
+    const sliceWidth = width / dataArray.length;
+    let x = 0;
+
+    for (let i = 0; i < dataArray.length; i++) {
+      const v = dataArray[i] / 128.0;
+      const y = (v * height) / 2;
+
+      if (i === 0) ctx.moveTo(x, y);
+      else ctx.lineTo(x, y);
+
+      x += sliceWidth;
+    }
+
+    ctx.lineTo(width, height / 2);
+    ctx.stroke();
+  }
+
+  drawBars(dataArray) {
+    const ctx = this.barsCtx;
+    const canvas = this.barsCanvas;
+    const width = canvas.width / (window.devicePixelRatio || 1);
+    const height = canvas.height / (window.devicePixelRatio || 1);
+
+    ctx.clearRect(0, 0, width, height);
+
+    const barCount = 120; // Number of bars to display
+    const barWidth = (width / barCount);
+    const samplesPerBar = Math.floor(dataArray.length / barCount);
+
+    for (let i = 0; i < barCount; i++) {
+      let sum = 0;
+      for (let j = 0; j < samplesPerBar; j++) {
+        sum += dataArray[i * samplesPerBar + j];
+      }
+      const average = sum / samplesPerBar;
+      const barHeight = (average / 255) * height * 0.8;
+
+      // Gradient for bar - Mint spectrum
+      const gradient = ctx.createLinearGradient(0, height, 0, height - barHeight);
+      gradient.addColorStop(0, '#3eb48944');
+      gradient.addColorStop(1, '#7fffd4cc');
+
+      ctx.fillStyle = gradient;
+
+      // Rounded bars
+      const x = i * barWidth;
+      const y = height - barHeight;
+      const radius = barWidth / 2;
+
+      ctx.beginPath();
+      ctx.roundRect(x + 1, y, barWidth - 2, barHeight, [2, 2, 0, 0]);
+      ctx.fill();
+    }
+  }
+}
+
+// Start the app
+document.addEventListener('DOMContentLoaded', () => {
+  window.app = new PocketTTSStreaming();
+});
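The page bootstraps itself on DOMContentLoaded and exposes the instance as window.app, so generation can also be driven from the browser console or another script on the same page (a secure context is still required, and startGeneration queues the request until the worker reports 'loaded'). A small illustrative usage, with arbitrary sample text:

// Drive the demo programmatically once window.app exists (sketch only).
document.getElementById('text-input').value = 'Hello from the console.';
document.getElementById('text-input').dispatchEvent(new Event('input')); // refresh the character count
window.app.startGeneration(); // queued automatically if the models are still loading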