openvoiceui 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +104 -0
- package/Dockerfile +30 -0
- package/LICENSE +21 -0
- package/README.md +638 -0
- package/SETUP.md +360 -0
- package/app.py +232 -0
- package/auto-approve-devices.js +111 -0
- package/cli/index.js +372 -0
- package/config/__init__.py +4 -0
- package/config/default.yaml +43 -0
- package/config/flags.yaml +67 -0
- package/config/loader.py +203 -0
- package/config/providers.yaml +71 -0
- package/config/speech_normalization.yaml +182 -0
- package/config/theme.json +4 -0
- package/data/greetings.json +25 -0
- package/default-pages/ai-image-creator.html +915 -0
- package/default-pages/bulk-image-uploader.html +492 -0
- package/default-pages/desktop.html +2865 -0
- package/default-pages/file-explorer.html +854 -0
- package/default-pages/interactive-map.html +655 -0
- package/default-pages/style-guide.html +1005 -0
- package/default-pages/website-setup.html +1623 -0
- package/deploy/openclaw/Dockerfile +46 -0
- package/deploy/openvoiceui.service +30 -0
- package/deploy/setup-nginx.sh +50 -0
- package/deploy/setup-sudo.sh +306 -0
- package/deploy/skill-runner/Dockerfile +19 -0
- package/deploy/skill-runner/requirements.txt +14 -0
- package/deploy/skill-runner/server.py +269 -0
- package/deploy/supertonic/Dockerfile +22 -0
- package/deploy/supertonic/server.py +79 -0
- package/docker-compose.pinokio.yml +11 -0
- package/docker-compose.yml +59 -0
- package/greetings.json +25 -0
- package/index.html +65 -0
- package/inject-device-identity.js +142 -0
- package/package.json +82 -0
- package/profiles/default.json +114 -0
- package/profiles/manager.py +354 -0
- package/profiles/schema.json +337 -0
- package/prompts/voice-system-prompt.md +149 -0
- package/providers/__init__.py +39 -0
- package/providers/base.py +63 -0
- package/providers/llm/__init__.py +12 -0
- package/providers/llm/base.py +71 -0
- package/providers/llm/clawdbot_provider.py +112 -0
- package/providers/llm/zai_provider.py +115 -0
- package/providers/registry.py +320 -0
- package/providers/stt/__init__.py +12 -0
- package/providers/stt/base.py +58 -0
- package/providers/stt/webspeech_provider.py +49 -0
- package/providers/stt/whisper_provider.py +100 -0
- package/providers/tts/__init__.py +20 -0
- package/providers/tts/base.py +91 -0
- package/providers/tts/groq_provider.py +74 -0
- package/providers/tts/supertonic_provider.py +72 -0
- package/requirements.txt +38 -0
- package/routes/__init__.py +10 -0
- package/routes/admin.py +515 -0
- package/routes/canvas.py +1315 -0
- package/routes/chat.py +51 -0
- package/routes/conversation.py +2158 -0
- package/routes/elevenlabs_hybrid.py +306 -0
- package/routes/greetings.py +98 -0
- package/routes/icons.py +279 -0
- package/routes/image_gen.py +364 -0
- package/routes/instructions.py +190 -0
- package/routes/music.py +838 -0
- package/routes/onboarding.py +43 -0
- package/routes/pi.py +62 -0
- package/routes/profiles.py +215 -0
- package/routes/report_issue.py +68 -0
- package/routes/static_files.py +533 -0
- package/routes/suno.py +664 -0
- package/routes/theme.py +81 -0
- package/routes/transcripts.py +199 -0
- package/routes/vision.py +348 -0
- package/routes/workspace.py +288 -0
- package/server.py +1510 -0
- package/services/__init__.py +1 -0
- package/services/auth.py +143 -0
- package/services/canvas_versioning.py +239 -0
- package/services/db_pool.py +107 -0
- package/services/gateway.py +16 -0
- package/services/gateway_manager.py +333 -0
- package/services/gateways/__init__.py +12 -0
- package/services/gateways/base.py +110 -0
- package/services/gateways/compat.py +264 -0
- package/services/gateways/openclaw.py +1134 -0
- package/services/health.py +100 -0
- package/services/memory_client.py +455 -0
- package/services/paths.py +26 -0
- package/services/speech_normalizer.py +285 -0
- package/services/tts.py +270 -0
- package/setup-config.js +262 -0
- package/sounds/air_horn.mp3 +0 -0
- package/sounds/bruh.mp3 +0 -0
- package/sounds/crowd_cheer.mp3 +0 -0
- package/sounds/gunshot.mp3 +0 -0
- package/sounds/impact.mp3 +0 -0
- package/sounds/lets_go.mp3 +0 -0
- package/sounds/record_stop.mp3 +0 -0
- package/sounds/rewind.mp3 +0 -0
- package/sounds/sad_trombone.mp3 +0 -0
- package/sounds/scratch_long.mp3 +0 -0
- package/sounds/yeah.mp3 +0 -0
- package/src/adapters/ClawdBotAdapter.js +264 -0
- package/src/adapters/_template.js +133 -0
- package/src/adapters/elevenlabs-classic.js +841 -0
- package/src/adapters/elevenlabs-hybrid.js +812 -0
- package/src/adapters/hume-evi.js +676 -0
- package/src/admin.html +1339 -0
- package/src/app.js +8802 -0
- package/src/core/Config.js +173 -0
- package/src/core/EmotionEngine.js +307 -0
- package/src/core/EventBridge.js +180 -0
- package/src/core/EventBus.js +117 -0
- package/src/core/VoiceSession.js +607 -0
- package/src/face/BaseFace.js +259 -0
- package/src/face/EyeFace.js +208 -0
- package/src/face/HaloSmokeFace.js +509 -0
- package/src/face/manifest.json +27 -0
- package/src/face/previews/eyes.svg +16 -0
- package/src/face/previews/orb.svg +29 -0
- package/src/features/MusicPlayer.js +620 -0
- package/src/features/Soundboard.js +128 -0
- package/src/providers/DeepgramSTT.js +472 -0
- package/src/providers/DeepgramStreamingSTT.js +766 -0
- package/src/providers/GroqSTT.js +559 -0
- package/src/providers/TTSPlayer.js +323 -0
- package/src/providers/WebSpeechSTT.js +479 -0
- package/src/providers/tts/BaseTTSProvider.js +81 -0
- package/src/providers/tts/HumeProvider.js +77 -0
- package/src/providers/tts/SupertonicProvider.js +174 -0
- package/src/providers/tts/index.js +140 -0
- package/src/shell/adapter-registry.js +154 -0
- package/src/shell/caller-bridge.js +35 -0
- package/src/shell/camera-bridge.js +28 -0
- package/src/shell/canvas-bridge.js +32 -0
- package/src/shell/commercial-bridge.js +44 -0
- package/src/shell/face-bridge.js +44 -0
- package/src/shell/music-bridge.js +60 -0
- package/src/shell/orchestrator.js +233 -0
- package/src/shell/profile-discovery.js +303 -0
- package/src/shell/sounds-bridge.js +28 -0
- package/src/shell/transcript-bridge.js +61 -0
- package/src/shell/waveform-bridge.js +33 -0
- package/src/styles/base.css +2862 -0
- package/src/styles/face.css +417 -0
- package/src/styles/pi-overrides.css +89 -0
- package/src/styles/theme-dark.css +67 -0
- package/src/test-tts.html +175 -0
- package/src/ui/AppShell.js +544 -0
- package/src/ui/ProfileSwitcher.js +228 -0
- package/src/ui/SessionControl.js +240 -0
- package/src/ui/face/FacePicker.js +195 -0
- package/src/ui/face/FaceRenderer.js +309 -0
- package/src/ui/settings/PlaylistEditor.js +366 -0
- package/src/ui/settings/SettingsPanel.css +684 -0
- package/src/ui/settings/SettingsPanel.js +419 -0
- package/src/ui/settings/TTSVoicePreview.js +210 -0
- package/src/ui/themes/ThemeManager.js +213 -0
- package/src/ui/visualizers/BaseVisualizer.js +29 -0
- package/src/ui/visualizers/PartyFXVisualizer.css +291 -0
- package/src/ui/visualizers/PartyFXVisualizer.js +637 -0
- package/static/emulators/jsdos/js-dos.css +1 -0
- package/static/emulators/jsdos/js-dos.js +22 -0
- package/static/favicon.svg +55 -0
- package/static/icons/apple-touch-icon.png +0 -0
- package/static/icons/favicon-32.png +0 -0
- package/static/icons/icon-192.png +0 -0
- package/static/icons/icon-512.png +0 -0
- package/static/install.html +449 -0
- package/static/manifest.json +26 -0
- package/static/sw.js +21 -0
- package/tts_providers/__init__.py +136 -0
- package/tts_providers/base_provider.py +319 -0
- package/tts_providers/groq_provider.py +155 -0
- package/tts_providers/hume_provider.py +226 -0
- package/tts_providers/providers_config.json +119 -0
- package/tts_providers/qwen3_provider.py +371 -0
- package/tts_providers/resemble_provider.py +315 -0
- package/tts_providers/supertonic_provider.py +557 -0
- package/tts_providers/supertonic_tts.py +399 -0
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Soundboard — extracted from index.html DJSoundboard (P3-T6)
|
|
3
|
+
*
|
|
4
|
+
* Manages a library of DJ sound effects with preloading, debounce,
|
|
5
|
+
* and AI text-trigger detection.
|
|
6
|
+
*
|
|
7
|
+
* Usage:
|
|
8
|
+
* import { Soundboard } from './Soundboard.js';
|
|
9
|
+
* const board = new Soundboard({ serverUrl: 'http://localhost:5000' });
|
|
10
|
+
* board.init();
|
|
11
|
+
* window.djSoundboard = board;
|
|
12
|
+
*
|
|
13
|
+
* EventBus events emitted (optional):
|
|
14
|
+
* 'soundboard:play' { soundName }
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
export class Soundboard {
    /**
     * Build a soundboard with the built-in DJ sound library.
     *
     * @param {Object} [options]
     * @param {string} [options.serverUrl=''] Base URL that serves `/sounds/<file>`.
     *        A single trailing slash is removed so generated URLs never contain `//sounds`.
     * @param {Object|null} [options.eventBus=null] Optional bus; when set, its
     *        `emit('soundboard:play', { soundName })` is called on every play.
     */
    constructor({ serverUrl = '', eventBus = null } = {}) {
        this.serverUrl = String(serverUrl ?? '').replace(/\/$/, '');
        this._eventBus = eventBus;

        // Sound name -> { file, triggers }. Insertion order matters: checkTriggers()
        // walks this map in order and the first matching sound wins.
        this.sounds = {
            'air_horn':     { file: 'air_horn.mp3',     triggers: ['air horn', 'airhorn', 'horn', 'bwaaah', 'bwaaa', 'bwah'] },
            'scratch_long': { file: 'scratch_long.mp3', triggers: ['scratch', 'scratching', 'wicka', 'wikka'] },
            'rewind':       { file: 'rewind.mp3',       triggers: ['rewind', 'pull up', 'pull it back', 'hold up', 'bring it back'] },
            'record_stop':  { file: 'record_stop.mp3',  triggers: ['record stop', 'stop the record'] },
            'crowd_cheer':  { file: 'crowd_cheer.mp3',  triggers: ['crowd cheer', 'applause', 'crowd goes wild', 'give it up', 'make some noise'] },
            // NOTE(review): confirm that crowd_hype.mp3 is actually shipped with the sound assets.
            'crowd_hype':   { file: 'crowd_hype.mp3',   triggers: ['crowd hype', 'hype them up', 'get hype'] },
            'yeah':         { file: 'yeah.mp3',         triggers: ['yeah!', 'yeahhh', 'oh yeah', 'yeeah'] },
            'lets_go':      { file: 'lets_go.mp3',      triggers: ["let's go!", 'lets go!', "let's goooo", 'here we go'] },
            'gunshot':      { file: 'gunshot.mp3',      triggers: ['gunshot', 'gun shot', 'bang bang', 'shots fired', 'pow pow', 'blat blat'] },
            'bruh':         { file: 'bruh.mp3',         triggers: ['bruh', 'bruhhh'] },
            'sad_trombone': { file: 'sad_trombone.mp3', triggers: ['sad trombone', 'womp womp', 'fail', 'wah wah'] }
        };

        /** @type {Object.<string, HTMLAudioElement>} */
        this.audioCache = {};

        /** @type {Object.<string, number>} Timestamp (ms) of the last play per sound, used for debouncing. */
        this.lastPlayTime = {};
    }

    /**
     * Preload common sounds so they play instantly on first trigger.
     */
    init() {
        ['air_horn', 'scratch_long', 'crowd_cheer', 'rewind', 'yeah', 'lets_go'].forEach(name => {
            this.preload(name);
        });
        console.log('Soundboard initialized with', Object.keys(this.sounds).length, 'sounds');
    }

    /**
     * Preload a sound into the audio cache. Unknown names are ignored.
     * @param {string} soundName
     */
    preload(soundName) {
        if (!this.sounds[soundName]) return;
        const audio = new Audio(`${this.serverUrl}/sounds/${this.sounds[soundName].file}`);
        audio.preload = 'auto';
        this.audioCache[soundName] = audio;
    }

    /**
     * Play a sound by name. Debounced (500ms) to avoid duplicate fires.
     * Unknown names only log a warning.
     * @param {string} soundName
     */
    play(soundName) {
        if (!this.sounds[soundName]) {
            console.warn('Unknown sound:', soundName);
            return;
        }

        // Debounce — don't play same sound within 500ms
        const now = Date.now();
        if (this.lastPlayTime[soundName] && now - this.lastPlayTime[soundName] < 500) {
            return;
        }
        this.lastPlayTime[soundName] = now;

        // Use cached audio or create a fresh element if cached is still playing
        let audio = this.audioCache[soundName];
        if (!audio || !audio.paused) {
            audio = new Audio(`${this.serverUrl}/sounds/${this.sounds[soundName].file}`);
        }

        audio.currentTime = 0;
        audio.volume = 0.4; // Lower than music so voice stays audible
        audio.play().catch(e => console.error('Sound play error:', e));
        console.log('🎧 DJ Sound:', soundName);

        if (this._eventBus) {
            this._eventBus.emit('soundboard:play', { soundName });
        }
    }

    /**
     * Scan text for trigger phrases and play the first matching sound.
     * Only one sound fires per call (first match wins). Matching is a
     * case-insensitive substring test on both the text and the trigger.
     *
     * @param {string} text
     * @returns {string|null} name of the first sound whose trigger matched
     *          (play() may still have debounced it), or null when nothing
     *          matched or `text` is not a non-empty string
     */
    checkTriggers(text) {
        if (typeof text !== 'string' || text === '') return null;
        const lowerText = text.toLowerCase();

        for (const [soundName, config] of Object.entries(this.sounds)) {
            const triggers = Array.isArray(config.triggers) ? config.triggers : [];
            for (const trigger of triggers) {
                // An empty trigger would match every text; non-strings cannot match at all.
                if (typeof trigger !== 'string' || trigger === '') continue;
                // Lowercase the trigger too so phrases added via addSound() match regardless of case.
                if (lowerText.includes(trigger.toLowerCase())) {
                    this.play(soundName);
                    return soundName;
                }
            }
        }

        return null;
    }

    /**
     * Add or update a sound definition at runtime.
     * Any preloaded audio element for `name` is dropped so an updated
     * definition does not keep playing the previously cached file.
     *
     * @param {string} name
     * @param {string} file      filename (relative to /sounds/)
     * @param {string[]} triggers trigger phrases
     */
    addSound(name, file, triggers = []) {
        this.sounds[name] = { file, triggers };
        delete this.audioCache[name];
    }
}
|
|
@@ -0,0 +1,472 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DeepgramSTT — Server-side speech recognition via Deepgram Nova-2 API.
|
|
3
|
+
* Captures audio with MediaRecorder, uses VAD to detect speech/silence,
|
|
4
|
+
* sends audio chunks to /api/stt/deepgram for transcription.
|
|
5
|
+
*
|
|
6
|
+
* Drop-in replacement for WebSpeechSTT / GroqSTT with built-in PTT support.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* import { DeepgramSTT, DeepgramWakeWordDetector } from './DeepgramSTT.js';
|
|
10
|
+
*
|
|
11
|
+
* const stt = new DeepgramSTT();
|
|
12
|
+
* stt.onResult = (text) => console.log('Heard:', text);
|
|
13
|
+
* await stt.start();
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
// ===== DEEPGRAM STT =====
|
|
17
|
+
// Server-side speech recognition via Deepgram Nova-2 API
|
|
18
|
+
class DeepgramSTT {
    /**
     * Speech-to-text client that records microphone audio with MediaRecorder,
     * segments it with a simple FFT-amplitude VAD, and posts each segment to
     * `${serverUrl}/api/stt/deepgram`.
     *
     * Callbacks (assign after construction):
     *   onResult(text)       final text to act on
     *   onListenFinal(text)  raw transcript of every transcribed segment
     *   onError(err)         'no-device' | 'not-allowed' | Error
     *
     * @param {Object} [config]
     * @param {string} [config.serverUrl] Base URL of the backend; falls back to
     *        `window.AGENT_CONFIG.serverUrl`, then `window.location.origin`.
     * @param {number} [config.accumulationDelayMs=0] Listen-mode delay before the
     *        accumulated transcript is delivered through onResult.
     */
    constructor(config = {}) {
        this.serverUrl = (config.serverUrl || window.AGENT_CONFIG?.serverUrl || window.location.origin).replace(/\/$/, '');
        this.isListening = false;
        this.onResult = null;
        this.onError = null;
        this.onListenFinal = null;  // Listen panel hook — called with each transcript
        this.onInterim = null;      // Not used (pre-recorded mode has no interim results)
        this.mediaRecorder = null;
        this.audioChunks = [];
        this.stream = null;
        this.isProcessing = false;
        this.accumulatedText = '';  // PTT compatibility — last transcript

        // PTT support (built-in, no monkey-patching needed)
        this._micMuted = false;
        this._pttHolding = false;
        this._muteActive = false;   // Set by mute(), cleared by resume()

        // VAD (Voice Activity Detection) settings
        this.silenceTimer = null;
        this.silenceDelayMs = 800;      // 0.8s silence = end of speech
        this.accumulationDelayMs = config.accumulationDelayMs || 0;
        this.vadThreshold = 25;         // FFT average amplitude threshold
        this.minSpeechMs = 300;         // Must sustain above threshold before counting
        this.maxRecordingMs = 45000;    // 45s max before auto-chunk
        this.maxRecordingTimer = null;
        this.isSpeaking = false;
        this.stoppingRecorder = false;
        this.hadSpeechInChunk = false;
        this._speechStartTime = 0;
        this._resumedSpeechStart = 0;

        // Audio analysis for VAD
        this._audioCtx = null;
        this._analyser = null;
        this._vadAnimFrame = null;
        this._accumulationTimer = null;
    }

    /**
     * @returns {boolean} true when `navigator.mediaDevices.getUserMedia` exists
     */
    isSupported() {
        return !!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia);
    }

    /**
     * Acquire the microphone (if needed), then start recording and the VAD loop.
     * Failures are reported through onError rather than thrown.
     *
     * @returns {Promise<boolean>} true when listening (or already listening);
     *          false when PTT-muted or when startup failed
     */
    async start() {
        if (this.isListening) return true;
        if (this._micMuted) return false;

        try {
            if (!this.stream || !this.stream.active) {
                this.stream = await navigator.mediaDevices.getUserMedia({ audio: true });
            }

            // mute()/pttMute() set isProcessing, but stop() only clears the mute
            // flags. Without this reset a mute -> stop -> start sequence leaves
            // isProcessing stuck true, and every segment is then discarded.
            if (!this._muteActive) {
                this.isProcessing = false;
            }

            this._setupRecorder();
            this._startVAD();

            this.mediaRecorder.start();
            this.isListening = true;
            console.log('Deepgram STT started');
            return true;
        } catch (error) {
            console.error('Failed to start Deepgram STT:', error);
            if (error.name === 'NotFoundError' || error.name === 'DevicesNotFoundError') {
                if (this.onError) this.onError('no-device');
            } else if (error.name === 'NotAllowedError') {
                if (this.onError) this.onError('not-allowed');
            } else {
                if (this.onError) this.onError(error);
            }
            return false;
        }
    }

    /**
     * Create a MediaRecorder on the current stream and wire its handlers.
     * Prefers WebM/Opus; when the browser reports that type as unsupported the
     * recorder is created with the browser default instead of throwing.
     */
    _setupRecorder() {
        const preferredType = 'audio/webm;codecs=opus';
        const preferredSupported = typeof MediaRecorder.isTypeSupported !== 'function'
            || MediaRecorder.isTypeSupported(preferredType);

        this.mediaRecorder = preferredSupported
            ? new MediaRecorder(this.stream, { mimeType: preferredType })
            : new MediaRecorder(this.stream);
        this.audioChunks = [];

        // Keep the historical 'audio/webm' label for the preferred path; on the
        // fallback path label the upload with what the recorder actually produces.
        const blobType = preferredSupported
            ? 'audio/webm'
            : (this.mediaRecorder.mimeType || 'audio/webm');

        this.mediaRecorder.ondataavailable = (event) => {
            if (event.data.size > 0) {
                this.audioChunks.push(event.data);
            }
        };

        this.mediaRecorder.onstop = () => this._handleRecorderStop(blobType);
    }

    /**
     * Recorder `stop` handler: restart recording if appropriate, then
     * transcribe the segment that just ended unless it must be discarded.
     *
     * @param {string} blobType MIME type used for the uploaded Blob
     * @returns {Promise<void>}
     */
    async _handleRecorderStop(blobType) {
        const chunks = this.audioChunks;
        const hadSpeech = this.hadSpeechInChunk;
        this.audioChunks = [];
        this.hadSpeechInChunk = false;
        this.stoppingRecorder = false;

        // Restart recording immediately to minimize audio gap
        if (this.isListening && !this._micMuted && !this._muteActive && !this._pttHolding) {
            this.isSpeaking = false;
            // resume()/pttUnmute()/pttActivate() may already have restarted the
            // recorder before this event fired; start() on an active recorder
            // throws InvalidStateError.
            if (this.mediaRecorder && this.mediaRecorder.state === 'inactive') {
                this.mediaRecorder.start();
            }
        }

        if (chunks.length === 0) return;

        // Discard audio if muted (TTS playing)
        if ((this.isProcessing || this._muteActive) && !this._pttHolding) {
            return;
        }

        this.isProcessing = true;
        this._clearSpeechTimers();

        const audioBlob = new Blob(chunks, { type: blobType });

        // Skip if no speech and small audio — prevents hallucinations
        if (!hadSpeech && audioBlob.size < 50000) {
            console.log('Deepgram STT: skipping - no speech detected (' + audioBlob.size + ' bytes)');
            this.isProcessing = false;
            return;
        }

        try {
            console.log('Deepgram STT: sending audio (' + audioBlob.size + ' bytes)');
            const formData = new FormData();
            formData.append('audio', audioBlob, 'audio.webm');

            const response = await fetch(`${this.serverUrl}/api/stt/deepgram`, {
                method: 'POST',
                body: formData
            });

            // Surface HTTP failures instead of silently treating an error
            // payload as "no transcript".
            if (!response.ok) {
                throw new Error(`Deepgram STT request failed with HTTP ${response.status}`);
            }

            const data = await response.json();

            if (data.transcript && data.transcript.trim()) {
                this._deliverTranscript(data.transcript);
            }
        } catch (error) {
            console.error('Deepgram STT error:', error);
            if (this.onError) this.onError(error);
        } finally {
            this.isProcessing = false;
        }
    }

    /**
     * Route one non-empty transcript to the callbacks.
     * PTT mode (`_micMuted`) delivers immediately; listen mode appends to
     * `accumulatedText` and delivers after `accumulationDelayMs`.
     *
     * @param {string} transcript raw transcript returned by the server
     */
    _deliverTranscript(transcript) {
        console.log('Deepgram STT transcript:', transcript);
        if (this.onListenFinal) this.onListenFinal(transcript);

        const trimmed = transcript.trim();

        // PTT mode: send immediately
        if (this._micMuted) {
            this.accumulatedText = trimmed;
            if (this.onResult) this.onResult(this.accumulatedText);
            this.accumulatedText = '';
            return;
        }

        // Listen mode: accumulate across chunks, send after silence
        this.accumulatedText = this.accumulatedText
            ? this.accumulatedText + ' ' + trimmed
            : trimmed;

        this._clearAccumulationTimer();
        this._accumulationTimer = setTimeout(() => {
            this._accumulationTimer = null;
            const fullText = this.accumulatedText.trim();
            if (fullText && this.onResult) {
                console.log('Deepgram STT accumulated result:', fullText);
                this.onResult(fullText);
            }
            this.accumulatedText = '';
        }, this.accumulationDelayMs);
    }

    /** Cancel the end-of-speech and max-recording timers, if pending. */
    _clearSpeechTimers() {
        if (this.silenceTimer) { clearTimeout(this.silenceTimer); this.silenceTimer = null; }
        if (this.maxRecordingTimer) { clearTimeout(this.maxRecordingTimer); this.maxRecordingTimer = null; }
    }

    /** Cancel the pending listen-mode delivery timer, if any. */
    _clearAccumulationTimer() {
        if (this._accumulationTimer) { clearTimeout(this._accumulationTimer); this._accumulationTimer = null; }
    }

    /**
     * Ensure the analyser graph exists and the VAD loop is running.
     * Reuses an existing, non-closed AudioContext.
     */
    _startVAD() {
        if (this._audioCtx && this._audioCtx.state !== 'closed') {
            if (!this._vadAnimFrame) this._runVADLoop();
            return;
        }

        this._audioCtx = new AudioContext();
        const source = this._audioCtx.createMediaStreamSource(this.stream);
        this._analyser = this._audioCtx.createAnalyser();
        this._analyser.fftSize = 512;
        source.connect(this._analyser);

        this._runVADLoop();
    }

    /**
     * Per-animation-frame VAD state machine. Compares the mean FFT magnitude
     * with `vadThreshold` and:
     *   - marks speech started once it has been sustained for `minSpeechMs`,
     *   - arms a `silenceDelayMs` timer when speech stops (which stops the
     *     recorder and so triggers transcription),
     *   - arms a `maxRecordingMs` timer that force-chunks long utterances.
     * The loop ends itself when `isListening` becomes false.
     */
    _runVADLoop() {
        const bufferLength = this._analyser.frequencyBinCount;
        const dataArray = new Uint8Array(bufferLength);

        const checkLevel = () => {
            if (!this.isListening) {
                this._vadAnimFrame = null;
                return;
            }

            this._analyser.getByteFrequencyData(dataArray);
            const average = dataArray.reduce((a, b) => a + b, 0) / bufferLength;
            const isSpeakingNow = average > this.vadThreshold;

            // Skip VAD while muted (TTS playing)
            if (this._muteActive) {
                this._vadAnimFrame = requestAnimationFrame(checkLevel);
                return;
            }

            if (isSpeakingNow && !this.isSpeaking) {
                // Candidate speech start: require minSpeechMs of sustained level.
                const now = Date.now();
                if (!this._speechStartTime) this._speechStartTime = now;
                if (now - this._speechStartTime < this.minSpeechMs) {
                    this._vadAnimFrame = requestAnimationFrame(checkLevel);
                    return;
                }

                this.isSpeaking = true;
                this.hadSpeechInChunk = true;
                this._speechStartTime = 0;

                if (this.silenceTimer) { clearTimeout(this.silenceTimer); this.silenceTimer = null; }

                if (!this.maxRecordingTimer && !this.isProcessing && !this.stoppingRecorder) {
                    this.maxRecordingTimer = setTimeout(() => {
                        this.maxRecordingTimer = null;
                        this.isSpeaking = false;
                        this.stoppingRecorder = true;
                        if (this.silenceTimer) { clearTimeout(this.silenceTimer); this.silenceTimer = null; }
                        if (this.mediaRecorder && this.mediaRecorder.state === 'recording') {
                            this.mediaRecorder.stop();
                        }
                    }, this.maxRecordingMs);
                }
            } else if (isSpeakingNow && this.isSpeaking) {
                // Speech resumed during a pause: cancel the pending silence
                // timer only once the resumed level has lasted minSpeechMs.
                const now = Date.now();
                if (!this._resumedSpeechStart) this._resumedSpeechStart = now;
                if (now - this._resumedSpeechStart >= this.minSpeechMs && this.silenceTimer) {
                    clearTimeout(this.silenceTimer);
                    this.silenceTimer = null;
                    this._resumedSpeechStart = 0;
                }
            } else if (!isSpeakingNow && !this.isSpeaking) {
                this._speechStartTime = 0;
                this._resumedSpeechStart = 0;
            } else if (!isSpeakingNow && this.isSpeaking && !this.isProcessing && !this.stoppingRecorder) {
                // Speech just dropped below threshold: arm the end-of-speech timer.
                this._resumedSpeechStart = 0;
                if (!this.silenceTimer) {
                    this.silenceTimer = setTimeout(() => {
                        this.isSpeaking = false;
                        this.stoppingRecorder = true;
                        if (this.mediaRecorder && this.mediaRecorder.state === 'recording') {
                            this.mediaRecorder.stop();
                        }
                    }, this.silenceDelayMs);
                }
            }

            this._vadAnimFrame = requestAnimationFrame(checkLevel);
        };

        this._vadAnimFrame = requestAnimationFrame(checkLevel);
    }

    /**
     * Stop listening and release the microphone, timers, VAD loop and
     * AudioContext. Also clears the mute flags.
     */
    stop() {
        this.isListening = false;
        this.stoppingRecorder = false;
        this._micMuted = false;
        this._muteActive = false;

        this._clearSpeechTimers();
        this._clearAccumulationTimer();
        if (this._vadAnimFrame) { cancelAnimationFrame(this._vadAnimFrame); this._vadAnimFrame = null; }

        if (this.mediaRecorder && this.mediaRecorder.state !== 'inactive') {
            this.mediaRecorder.stop();
        }

        if (this.stream) {
            this.stream.getTracks().forEach(track => track.stop());
            this.stream = null;
        }

        if (this._audioCtx) {
            this._audioCtx.close().catch(() => {});
            this._audioCtx = null;
            this._analyser = null;
        }

        console.log('Deepgram STT stopped');
    }

    /** Clear the processing flag and any accumulated transcript. */
    resetProcessing() {
        this.isProcessing = false;
        this.accumulatedText = '';
    }

    /** Alias of mute(). */
    pause() { this.mute(); }

    /**
     * Mute: stop the recorder and drop pending text so nothing recorded while
     * muted is transcribed. Undone by resume().
     */
    mute() {
        this._muteActive = true;
        this.isProcessing = true;
        this.hadSpeechInChunk = false;
        this.accumulatedText = '';
        this._clearSpeechTimers();
        this._clearAccumulationTimer();
        if (this.mediaRecorder && this.mediaRecorder.state === 'recording') {
            this.mediaRecorder.stop();
        }
    }

    /**
     * Undo mute(): reset segment state and, when still listening and not
     * PTT-muted, restart the recorder and VAD loop on the active stream.
     */
    resume() {
        this._muteActive = false;
        this.isProcessing = false;
        this.stoppingRecorder = false;
        this.hadSpeechInChunk = false;
        this.isSpeaking = false;
        this.audioChunks = [];

        if (this.isListening && !this._micMuted) {
            if (this.stream && this.stream.active) {
                if (!this.mediaRecorder || this.mediaRecorder.stream !== this.stream) {
                    this._setupRecorder();
                }
                if (this.mediaRecorder.state === 'inactive') {
                    this.mediaRecorder.start();
                }
                if (!this._vadAnimFrame) {
                    this._startVAD();
                }
            }
        }
    }

    // --- PTT helpers ---

    /** PTT button pressed: clear mute state and start a fresh recording. */
    pttActivate() {
        this._pttHolding = true;
        this._micMuted = false;
        this._muteActive = false;  // Clear stale TTS mute — PTT overrides
        this.isProcessing = false;
        this.accumulatedText = '';
        this.hadSpeechInChunk = false;
        this.audioChunks = [];
        this._clearSpeechTimers();

        if (this.mediaRecorder && this.mediaRecorder.state === 'inactive') {
            this.mediaRecorder.start();
        }
    }

    /**
     * PTT button released: stop the recorder so the held segment is sent.
     * `hadSpeechInChunk` is forced true so the segment is not skipped as silence.
     */
    pttRelease() {
        this._pttHolding = false;
        this._micMuted = true;
        this.hadSpeechInChunk = true;
        this.stoppingRecorder = true;

        this._clearSpeechTimers();

        if (this.mediaRecorder && this.mediaRecorder.state === 'recording') {
            this.mediaRecorder.stop();
        }
    }

    /** Enter PTT-muted state: stop recording and discard the current segment. */
    pttMute() {
        this._pttHolding = false;
        this._micMuted = true;
        this.hadSpeechInChunk = false;
        this._clearSpeechTimers();
        this.isProcessing = true;
        if (this.mediaRecorder && this.mediaRecorder.state === 'recording') {
            this.mediaRecorder.stop();
        }
    }

    /** Leave PTT-muted state and resume recording if still listening. */
    pttUnmute() {
        this._micMuted = false;
        this._pttHolding = false;
        this.isProcessing = false;
        this.stoppingRecorder = false;
        this.hadSpeechInChunk = false;
        this.audioChunks = [];

        if (this.isListening && this.mediaRecorder && this.mediaRecorder.state === 'inactive') {
            this.mediaRecorder.start();
        }
    }
}
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
// ===== DEEPGRAM WAKE WORD DETECTOR =====
|
|
409
|
+
class DeepgramWakeWordDetector {
    /**
     * Wake-word listener built on a private DeepgramSTT instance.
     * Assign `onWakeWordDetected` to be notified; edit `wakeWords` to change
     * the phrases (matched as case-insensitive substrings of each transcript).
     */
    constructor() {
        this.isListening = false;
        this.onWakeWordDetected = null;
        this.wakeWords = ['wake up'];
        this._stt = null;
    }

    /**
     * @returns {boolean} true when `navigator.mediaDevices.getUserMedia` exists
     */
    isSupported() {
        return !!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia);
    }

    /**
     * Create and start the inner DeepgramSTT with wake-word tuned settings.
     *
     * @returns {Promise<boolean>} true when the detector is listening; false
     *          when the inner STT failed to start or stop() was called while
     *          startup was still in progress
     */
    async start() {
        if (this.isListening) return true;

        const stt = new DeepgramSTT();
        stt.silenceDelayMs = 1500;
        stt.maxRecordingMs = 10000;
        stt.vadThreshold = 40;

        stt.onResult = (transcript) => {
            const lower = transcript.toLowerCase();
            console.log(`Wake word detector heard: "${transcript}"`);
            // Lowercase the wake words too so user-supplied phrases match regardless
            // of case; skip empty/non-string entries, which would match everything or throw.
            const matched = this.wakeWords.some(
                ww => typeof ww === 'string' && ww !== '' && lower.includes(ww.toLowerCase())
            );
            if (matched) {
                console.log('Wake word detected!');
                if (this.onWakeWordDetected) this.onWakeWordDetected();
            }
        };

        stt.onError = (error) => {
            console.warn('Wake word detector error:', error);
        };

        this._stt = stt;
        this.isListening = true;
        const ok = await stt.start();

        // stop() (or a later start()) replaced this._stt while we were awaiting.
        // Shut down this now-orphaned instance so it does not keep recording.
        if (this._stt !== stt) {
            stt.stop();
            return false;
        }

        if (!ok) {
            // Release anything the failed instance may have acquired.
            stt.stop();
            this._stt = null;
            this.isListening = false;
            return false;
        }

        console.log('Deepgram wake word detector started');
        return true;
    }

    /** Stop listening and dispose of the inner DeepgramSTT. */
    stop() {
        this.isListening = false;
        if (this._stt) {
            this._stt.stop();
            this._stt = null;
        }
        console.log('Deepgram wake word detector stopped');
    }

    /**
     * Start when stopped, stop when started.
     * @returns {Promise<boolean>} the resulting listening state
     */
    async toggle() {
        if (this.isListening) {
            this.stop();
            return false;
        } else {
            return await this.start();
        }
    }
}
|
|
471
|
+
|
|
472
|
+
export { DeepgramSTT, DeepgramWakeWordDetector };
|