openvoiceui 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +104 -0
- package/Dockerfile +30 -0
- package/LICENSE +21 -0
- package/README.md +638 -0
- package/SETUP.md +360 -0
- package/app.py +232 -0
- package/auto-approve-devices.js +111 -0
- package/cli/index.js +372 -0
- package/config/__init__.py +4 -0
- package/config/default.yaml +43 -0
- package/config/flags.yaml +67 -0
- package/config/loader.py +203 -0
- package/config/providers.yaml +71 -0
- package/config/speech_normalization.yaml +182 -0
- package/config/theme.json +4 -0
- package/data/greetings.json +25 -0
- package/default-pages/ai-image-creator.html +915 -0
- package/default-pages/bulk-image-uploader.html +492 -0
- package/default-pages/desktop.html +2865 -0
- package/default-pages/file-explorer.html +854 -0
- package/default-pages/interactive-map.html +655 -0
- package/default-pages/style-guide.html +1005 -0
- package/default-pages/website-setup.html +1623 -0
- package/deploy/openclaw/Dockerfile +46 -0
- package/deploy/openvoiceui.service +30 -0
- package/deploy/setup-nginx.sh +50 -0
- package/deploy/setup-sudo.sh +306 -0
- package/deploy/skill-runner/Dockerfile +19 -0
- package/deploy/skill-runner/requirements.txt +14 -0
- package/deploy/skill-runner/server.py +269 -0
- package/deploy/supertonic/Dockerfile +22 -0
- package/deploy/supertonic/server.py +79 -0
- package/docker-compose.pinokio.yml +11 -0
- package/docker-compose.yml +59 -0
- package/greetings.json +25 -0
- package/index.html +65 -0
- package/inject-device-identity.js +142 -0
- package/package.json +82 -0
- package/profiles/default.json +114 -0
- package/profiles/manager.py +354 -0
- package/profiles/schema.json +337 -0
- package/prompts/voice-system-prompt.md +149 -0
- package/providers/__init__.py +39 -0
- package/providers/base.py +63 -0
- package/providers/llm/__init__.py +12 -0
- package/providers/llm/base.py +71 -0
- package/providers/llm/clawdbot_provider.py +112 -0
- package/providers/llm/zai_provider.py +115 -0
- package/providers/registry.py +320 -0
- package/providers/stt/__init__.py +12 -0
- package/providers/stt/base.py +58 -0
- package/providers/stt/webspeech_provider.py +49 -0
- package/providers/stt/whisper_provider.py +100 -0
- package/providers/tts/__init__.py +20 -0
- package/providers/tts/base.py +91 -0
- package/providers/tts/groq_provider.py +74 -0
- package/providers/tts/supertonic_provider.py +72 -0
- package/requirements.txt +38 -0
- package/routes/__init__.py +10 -0
- package/routes/admin.py +515 -0
- package/routes/canvas.py +1315 -0
- package/routes/chat.py +51 -0
- package/routes/conversation.py +2158 -0
- package/routes/elevenlabs_hybrid.py +306 -0
- package/routes/greetings.py +98 -0
- package/routes/icons.py +279 -0
- package/routes/image_gen.py +364 -0
- package/routes/instructions.py +190 -0
- package/routes/music.py +838 -0
- package/routes/onboarding.py +43 -0
- package/routes/pi.py +62 -0
- package/routes/profiles.py +215 -0
- package/routes/report_issue.py +68 -0
- package/routes/static_files.py +533 -0
- package/routes/suno.py +664 -0
- package/routes/theme.py +81 -0
- package/routes/transcripts.py +199 -0
- package/routes/vision.py +348 -0
- package/routes/workspace.py +288 -0
- package/server.py +1510 -0
- package/services/__init__.py +1 -0
- package/services/auth.py +143 -0
- package/services/canvas_versioning.py +239 -0
- package/services/db_pool.py +107 -0
- package/services/gateway.py +16 -0
- package/services/gateway_manager.py +333 -0
- package/services/gateways/__init__.py +12 -0
- package/services/gateways/base.py +110 -0
- package/services/gateways/compat.py +264 -0
- package/services/gateways/openclaw.py +1134 -0
- package/services/health.py +100 -0
- package/services/memory_client.py +455 -0
- package/services/paths.py +26 -0
- package/services/speech_normalizer.py +285 -0
- package/services/tts.py +270 -0
- package/setup-config.js +262 -0
- package/sounds/air_horn.mp3 +0 -0
- package/sounds/bruh.mp3 +0 -0
- package/sounds/crowd_cheer.mp3 +0 -0
- package/sounds/gunshot.mp3 +0 -0
- package/sounds/impact.mp3 +0 -0
- package/sounds/lets_go.mp3 +0 -0
- package/sounds/record_stop.mp3 +0 -0
- package/sounds/rewind.mp3 +0 -0
- package/sounds/sad_trombone.mp3 +0 -0
- package/sounds/scratch_long.mp3 +0 -0
- package/sounds/yeah.mp3 +0 -0
- package/src/adapters/ClawdBotAdapter.js +264 -0
- package/src/adapters/_template.js +133 -0
- package/src/adapters/elevenlabs-classic.js +841 -0
- package/src/adapters/elevenlabs-hybrid.js +812 -0
- package/src/adapters/hume-evi.js +676 -0
- package/src/admin.html +1339 -0
- package/src/app.js +8802 -0
- package/src/core/Config.js +173 -0
- package/src/core/EmotionEngine.js +307 -0
- package/src/core/EventBridge.js +180 -0
- package/src/core/EventBus.js +117 -0
- package/src/core/VoiceSession.js +607 -0
- package/src/face/BaseFace.js +259 -0
- package/src/face/EyeFace.js +208 -0
- package/src/face/HaloSmokeFace.js +509 -0
- package/src/face/manifest.json +27 -0
- package/src/face/previews/eyes.svg +16 -0
- package/src/face/previews/orb.svg +29 -0
- package/src/features/MusicPlayer.js +620 -0
- package/src/features/Soundboard.js +128 -0
- package/src/providers/DeepgramSTT.js +472 -0
- package/src/providers/DeepgramStreamingSTT.js +766 -0
- package/src/providers/GroqSTT.js +559 -0
- package/src/providers/TTSPlayer.js +323 -0
- package/src/providers/WebSpeechSTT.js +479 -0
- package/src/providers/tts/BaseTTSProvider.js +81 -0
- package/src/providers/tts/HumeProvider.js +77 -0
- package/src/providers/tts/SupertonicProvider.js +174 -0
- package/src/providers/tts/index.js +140 -0
- package/src/shell/adapter-registry.js +154 -0
- package/src/shell/caller-bridge.js +35 -0
- package/src/shell/camera-bridge.js +28 -0
- package/src/shell/canvas-bridge.js +32 -0
- package/src/shell/commercial-bridge.js +44 -0
- package/src/shell/face-bridge.js +44 -0
- package/src/shell/music-bridge.js +60 -0
- package/src/shell/orchestrator.js +233 -0
- package/src/shell/profile-discovery.js +303 -0
- package/src/shell/sounds-bridge.js +28 -0
- package/src/shell/transcript-bridge.js +61 -0
- package/src/shell/waveform-bridge.js +33 -0
- package/src/styles/base.css +2862 -0
- package/src/styles/face.css +417 -0
- package/src/styles/pi-overrides.css +89 -0
- package/src/styles/theme-dark.css +67 -0
- package/src/test-tts.html +175 -0
- package/src/ui/AppShell.js +544 -0
- package/src/ui/ProfileSwitcher.js +228 -0
- package/src/ui/SessionControl.js +240 -0
- package/src/ui/face/FacePicker.js +195 -0
- package/src/ui/face/FaceRenderer.js +309 -0
- package/src/ui/settings/PlaylistEditor.js +366 -0
- package/src/ui/settings/SettingsPanel.css +684 -0
- package/src/ui/settings/SettingsPanel.js +419 -0
- package/src/ui/settings/TTSVoicePreview.js +210 -0
- package/src/ui/themes/ThemeManager.js +213 -0
- package/src/ui/visualizers/BaseVisualizer.js +29 -0
- package/src/ui/visualizers/PartyFXVisualizer.css +291 -0
- package/src/ui/visualizers/PartyFXVisualizer.js +637 -0
- package/static/emulators/jsdos/js-dos.css +1 -0
- package/static/emulators/jsdos/js-dos.js +22 -0
- package/static/favicon.svg +55 -0
- package/static/icons/apple-touch-icon.png +0 -0
- package/static/icons/favicon-32.png +0 -0
- package/static/icons/icon-192.png +0 -0
- package/static/icons/icon-512.png +0 -0
- package/static/install.html +449 -0
- package/static/manifest.json +26 -0
- package/static/sw.js +21 -0
- package/tts_providers/__init__.py +136 -0
- package/tts_providers/base_provider.py +319 -0
- package/tts_providers/groq_provider.py +155 -0
- package/tts_providers/hume_provider.py +226 -0
- package/tts_providers/providers_config.json +119 -0
- package/tts_providers/qwen3_provider.py +371 -0
- package/tts_providers/resemble_provider.py +315 -0
- package/tts_providers/supertonic_provider.py +557 -0
- package/tts_providers/supertonic_tts.py +399 -0
|
@@ -0,0 +1,2158 @@
|
|
|
1
|
+
"""
|
|
2
|
+
routes/conversation.py — Conversation & TTS Blueprint (P2-T3)
|
|
3
|
+
|
|
4
|
+
Extracted from server.py during Phase 2 blueprint split.
|
|
5
|
+
Registers routes:
|
|
6
|
+
POST /api/conversation (main voice conversation endpoint)
|
|
7
|
+
POST /api/conversation/reset (clear conversation history for a session)
|
|
8
|
+
GET /api/tts/providers (list available TTS providers)
|
|
9
|
+
POST /api/tts/generate (generate TTS audio from text)
|
|
10
|
+
POST /api/supertonic-tts (deprecated legacy TTS endpoint)
|
|
11
|
+
|
|
12
|
+
Also exports helpers used by other server.py code:
|
|
13
|
+
get_voice_session_key()
|
|
14
|
+
bump_voice_session()
|
|
15
|
+
conversation_histories (dict of session histories)
|
|
16
|
+
_consecutive_empty_responses (module global, accessed via this module)
|
|
17
|
+
clean_for_tts()
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import base64
|
|
21
|
+
import json
|
|
22
|
+
import logging
|
|
23
|
+
import os
|
|
24
|
+
import queue
|
|
25
|
+
import re
|
|
26
|
+
import sqlite3
|
|
27
|
+
import threading
|
|
28
|
+
import time
|
|
29
|
+
from datetime import datetime
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
|
|
32
|
+
from flask import Blueprint, Response, jsonify, make_response, request
|
|
33
|
+
|
|
34
|
+
from routes.canvas import canvas_context, update_canvas_context, CANVAS_PAGES_DIR
|
|
35
|
+
from routes.transcripts import save_conversation_turn
|
|
36
|
+
from routes.music import current_music_state as _music_state
|
|
37
|
+
from services.gateway_manager import gateway_manager
|
|
38
|
+
from services.gateways.compat import is_system_response
|
|
39
|
+
from services.tts import generate_tts_b64 as _tts_generate_b64
|
|
40
|
+
from tts_providers import get_provider, list_providers
|
|
41
|
+
|
|
42
|
+
logger = logging.getLogger(__name__)
|
|
43
|
+
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
# Constants
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
from services.paths import DB_PATH, VOICE_SESSION_FILE
|
|
49
|
+
|
|
50
|
+
# JSONL file the Brain context-tracker tails; _notify_brain() appends here.
BRAIN_EVENTS_PATH = Path('/tmp/openvoiceui-events.jsonl')
# Max turns kept per session in conversation_histories before trimming.
MAX_HISTORY_MESSAGES = 20

# Vision keyword detection — triggers camera frame analysis via GLM-4V
# (substring match against the lowercased user message; see _is_vision_request).
_VISION_KEYWORDS = (
    'what do you see', 'what can you see', 'what are you seeing',
    'look at', 'what is in front', "what's in front",
    'describe what', 'tell me what you see', 'can you see',
    'what is that', "what's that", 'who is that', "who's that",
    'what am i holding', 'what am i wearing', 'what does it look like',
    'what am i showing', 'what is this', "what's this",
    'show me what you see', 'use the camera', 'check the camera',
    'look through the camera', 'do you see', 'you see this',
    'take a look', 'what color', 'read this', 'read that',
)
_VISION_FRAME_MAX_AGE = 10 # seconds — ignore frames older than this

# ---------------------------------------------------------------------------
# Voice assistant instructions — injected into every message context.
#
# PRIMARY SOURCE: prompts/voice-system-prompt.md (hot-reload, no restart needed)
# Editable via admin API: PUT /api/instructions/voice-system-prompt
#
# FALLBACK: _VOICE_INSTRUCTIONS constant below (used if file missing/unreadable)
# ---------------------------------------------------------------------------

# routes/ lives one level below the package root, so prompts/ is a sibling dir.
_PROMPTS_DIR = Path(__file__).parent.parent / 'prompts'
_VOICE_PROMPT_FILE = _PROMPTS_DIR / 'voice-system-prompt.md'
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _load_voice_system_prompt() -> str:
    """Read the voice system prompt from disk, re-reading on every call.

    Lines beginning with '#' are treated as comments and dropped; the
    remaining non-blank lines are stripped and collapsed into one
    space-joined string. Returns the hardcoded _VOICE_INSTRUCTIONS
    fallback when the file is missing, unreadable, or yields no content.
    """
    try:
        raw_text = _VOICE_PROMPT_FILE.read_text(encoding='utf-8')
    except Exception:
        return _VOICE_INSTRUCTIONS  # file missing/unreadable — use fallback
    pieces = []
    for line in raw_text.splitlines():
        if line.startswith('#'):
            continue  # comment line in the prompt file
        stripped = line.strip()
        if stripped:
            pieces.append(stripped)
    prompt = ' '.join(pieces)
    return prompt if prompt else _VOICE_INSTRUCTIONS
|
|
92
|
+
# Hardcoded fallback system prompt — used only when prompts/voice-system-prompt.md
# is missing, unreadable, or empty (see _load_voice_system_prompt above).
# NOTE: this is runtime-visible prompt text; keep the string content unchanged.
_VOICE_INSTRUCTIONS = (
    "[OPENVOICEUI SYSTEM INSTRUCTIONS: "

    # --- Voice & Tone ---
    "You are a voice AI assistant. ALWAYS respond in English — never Chinese or any other language. "
    "Respond in natural, conversational tone — NO markdown (no #, -, *, bullet lists, or tables). "
    "Be brief and direct. Never sound like a call center agent or a search engine. "
    "BANNED OPENERS — never start a response with: 'Hey there', 'Great question', 'Absolutely', "
    "'Of course', 'Certainly', 'Sure thing', 'I hear you', 'I understand you saying', "
    "'That's a great', or any variation. Just answer. "
    "Do NOT repeat or paraphrase what the user just said. Do NOT end every reply with a question. "

    # --- Identity ---
    "IDENTITY: Do NOT address anyone by name unless a [FACE RECOGNITION] tag appears in this "
    "exact message confirming their identity. Different people use this interface. "
    "Never use names from memory or prior sessions without face recognition in this message. "

    # --- Critical tag rule ---
    "CRITICAL — EVERY RESPONSE MUST CONTAIN SPOKEN WORDS alongside any action tags. "
    "NEVER output a bare tag alone — the user hears silence and sees nothing. "
    "BAD: [CANVAS:page-id] GOOD: Here's your dashboard. [CANVAS:page-id] "
    "BAD: [MUSIC_PLAY] GOOD: Playing something for you now. [MUSIC_PLAY] "
    "Tags are invisible to the user — they only hear your words. "

    # --- Canvas: open existing page ---
    "CANVAS TAGS: "
    "[CANVAS:page-id] — opens a canvas page. Use exact page-id from the [Canvas pages:] list above. "
    "When opening, briefly say what the page shows (1-2 sentences). "
    "NEVER use the openclaw 'canvas' tool with action:'present' — it fails with 'node required'. "
    "ONLY the [CANVAS:page-id] tag works to open pages. "
    "Repeating [CANVAS:same-page] on an already-open page forces a refresh. "
    "[CANVAS_MENU] — opens the page picker so the user can browse all pages. "
    "[CANVAS_URL:https://example.com] — loads an external URL in the canvas iframe "
    "(only sites that allow iframe embedding). "

    # --- Canvas: create a new page ---
    "CREATING A NEW CANVAS PAGE: "
    "Step 1 — write the HTML file: write({path:'workspace/canvas/pagename.html', content:'<!DOCTYPE html>...'}). "
    "Step 2 — open it in your spoken response: 'Here it is. [CANVAS:pagename]' "
    "Step 3 — verify it opened: exec('curl -s http://openvoiceui:5001/api/canvas/context') "
    "returns {current_page, current_title}. If current_page matches → confirm to user. "
    "If still old page → say so and resend [CANVAS:pagename]. If null → say 'Opening canvas now.' and resend. "

    # --- Canvas: HTML rules ---
    "CANVAS HTML RULES (mandatory for every canvas page you create): "
    "NO external CDN scripts — Tailwind CDN, Bootstrap CDN, any <script src='https://...'> are BANNED (break in sandboxed iframes). "
    "All CSS and JS must be inline in <style> and <script> tags only. "
    "Google Fonts @import url(...) in <style> is OK. "
    "Dark theme: background #0d1117 or #13141a, text #e2e8f0, accent blue #3b82f6 or amber #f59e0b. "
    "Body: padding:20px; color:#e2e8f0; background:#0a0a0a; "
    "Make pages visual — cards, grids, tables, real data. No blank pages. "

    # --- Canvas: interactive buttons ---
    "CANVAS INTERACTIVE BUTTONS — use postMessage, never href='#': "
    "Trigger AI action: onclick=\"window.parent.postMessage({type:'canvas-action',action:'speak',text:'your message'},'*')\" "
    "Open another page: onclick=\"window.parent.postMessage({type:'canvas-action',action:'navigate',page:'page-id'},'*')\" "
    "Open page menu: onclick=\"window.parent.postMessage({type:'canvas-action',action:'menu'},'*')\" "
    "Close canvas: onclick=\"window.parent.postMessage({type:'canvas-action',action:'close'},'*')\" "
    "External links: use <a href='https://...' target='_blank'> — never href='#'. "

    # --- Canvas: make public ---
    "MAKE A PAGE PUBLIC (shareable without login): "
    "exec('curl -s -X PATCH http://openvoiceui:5001/api/canvas/manifest/page/PAGE_ID "
    "-H \"Content-Type: application/json\" -d \\'{{\"is_public\": true}}\\'') "
    "Shareable URL format: https://DOMAIN/pages/pagename.html "

    # --- Music ---
    "MUSIC TAGS: "
    "[MUSIC_PLAY] — play a random track. "
    "[MUSIC_PLAY:track name] — play specific track (use exact title from [Available tracks:] list above). "
    "[MUSIC_STOP] — stop music. "
    "[MUSIC_NEXT] — skip to next track. "
    "Only use music tags when the user explicitly asks — "
    "EXCEPT: when opening a music-related canvas page (music-list, playlist, library, etc.), "
    "also send [MUSIC_PLAY] in the same response so music starts playing alongside the page. "

    # --- Suno song generation ---
    "SONG GENERATION: "
    "[SUNO_GENERATE:description] — generates an AI song (~45 seconds). "
    "Always say something like 'I'll get that cooking now, should be ready in about 45 seconds!' "
    "The frontend handles Suno — do NOT call any Suno APIs yourself. "
    "After generation, the new song appears in [Available tracks:] by its title. "
    "Use [MUSIC_PLAY:song title] to play it — do NOT use exec/shell to find the file. "

    # --- Spotify ---
    "SPOTIFY: [SPOTIFY:song name] or [SPOTIFY:song name|artist name] — plays from Spotify. "
    "Example: [SPOTIFY:Bohemian Rhapsody|Queen]. Only use when user specifically asks. "

    # --- Sleep / goodbye ---
    "SLEEP: [SLEEP] — puts interface into passive wake-word mode. "
    "Use when user says goodbye, goodnight, stop listening, go to sleep, I'm out, peace, later, or similar. "
    "Always give a brief farewell (1-2 sentences) BEFORE the [SLEEP] tag. "
    "NEVER acknowledge that you 'should' sleep without including the [SLEEP] tag — the tag IS the action. "

    # --- Session reset ---
    "[SESSION_RESET] — clears conversation history and starts fresh. "
    "Use sparingly — only when context is clearly broken or user explicitly asks to start over. "

    # --- DJ soundboard ---
    "DJ SOUNDBOARD: [SOUND:name] — plays a sound effect. "
    "ONLY use in DJ mode (user explicitly said 'be a DJ', 'DJ mode', or 'put on a set'). "
    "NEVER use in normal conversation. "
    "Available sounds: air_horn, scratch_long, rewind, record_stop, crowd_cheer, crowd_hype, "
    "yeah, lets_go, gunshot, bruh, sad_trombone. "

    # --- Onboarding notifications ---
    "ONBOARDING NOTIFICATIONS (popup at top-center of screen): "
    "[NOTIFY:message] — show/update popup message. "
    "[NOTIFY_TITLE:text] — update popup title bar. "
    "[NOTIFY_PROGRESS:N/M] — show step progress dots (e.g. [NOTIFY_PROGRESS:2/5]). "
    "[NOTIFY_STATUS:text] — update small status line (e.g. '3 agents working...'). "
    "[NOTIFY_CLOSE] — hide popup temporarily. "
    "[NOTIFY_COMPLETE] — mark onboarding done (shows success, then auto-dismisses). "

    # --- Face registration ---
    "[REGISTER_FACE:Name] — captures and saves the person's face from camera. "
    "Only use when someone explicitly asks or introduces themselves. "
    "If camera is off, let them know. "

    # --- Camera vision ---
    "CAMERA VISION: When a [CAMERA VISION: ...] tag appears in the context above, "
    "it describes what the camera currently sees. Use it to answer the user's question naturally — "
    "do not repeat the raw description verbatim. If it says camera is off, let the user know. "

    "]"
)
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _is_vision_request(msg: str) -> bool:
    """Return True when *msg* contains any camera/vision trigger phrase.

    Matching is a case-insensitive substring test against _VISION_KEYWORDS.
    """
    lowered = msg.lower()
    for keyword in _VISION_KEYWORDS:
        if keyword in lowered:
            return True
    return False
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _cap_list(items, max_chars=2000, label="items"):
|
|
227
|
+
"""Join items with ', ' but cap at max_chars. Add '... and N more' if truncated."""
|
|
228
|
+
if not items:
|
|
229
|
+
return "none"
|
|
230
|
+
result = []
|
|
231
|
+
total = 0
|
|
232
|
+
for item in items:
|
|
233
|
+
addition = len(item) + (2 if result else 0) # ', ' separator
|
|
234
|
+
if total + addition > max_chars and result:
|
|
235
|
+
remaining = len(items) - len(result)
|
|
236
|
+
result.append(f"... and {remaining} more")
|
|
237
|
+
break
|
|
238
|
+
result.append(item)
|
|
239
|
+
total += addition
|
|
240
|
+
return ', '.join(result)
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
# ---------------------------------------------------------------------------
|
|
244
|
+
# DB write queue — background thread so DB writes don't block HTTP responses
|
|
245
|
+
# (FIND-01 / FIND-08 fix from performance audit)
|
|
246
|
+
# ---------------------------------------------------------------------------
|
|
247
|
+
|
|
248
|
+
# Unbounded FIFO of pending writes; each item is (db_path_str, sql, params).
_db_write_queue: queue.Queue = queue.Queue()
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _db_writer_loop():
    """Background daemon that drains _db_write_queue and writes to SQLite.

    Queue items: (db_path_str, sql, params).
    db_path_str is resolved at enqueue time so test patches to DB_PATH work.
    Connections are cached per db_path to reuse WAL-mode connections.

    Fix: on a write error the cached connection is closed and evicted, so
    the next item for that path reconnects instead of reusing a handle that
    may be permanently broken (previously one failure could poison every
    subsequent write for that path).
    """
    connections: dict = {}
    while True:
        try:
            db_path_str, sql, params = _db_write_queue.get(timeout=5)
        except queue.Empty:
            continue  # nothing queued; poll again
        try:
            conn = connections.get(db_path_str)
            if conn is None:
                conn = sqlite3.connect(db_path_str, check_same_thread=False, timeout=30)
                conn.execute("PRAGMA journal_mode=WAL")
                conn.execute("PRAGMA synchronous=NORMAL")
                conn.execute("PRAGMA cache_size=-64000")
                conn.execute("PRAGMA busy_timeout=30000")
                connections[db_path_str] = conn
            conn.execute(sql, params)
            conn.commit()
        except Exception as e:
            logger.error(f"[db-writer] loop error: {e}")
            # Evict the possibly-broken cached connection so the next write
            # for this path gets a fresh one.
            stale = connections.pop(db_path_str, None)
            if stale is not None:
                try:
                    stale.close()
                except Exception:
                    pass
        finally:
            # Mark the item done even on failure so Queue.join() can't hang.
            _db_write_queue.task_done()
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
# Single writer thread started at import time. daemon=True means it never
# blocks interpreter shutdown — still-queued writes may be lost on exit.
_db_writer_thread = threading.Thread(
    target=_db_writer_loop,
    name="conv-db-writer",
    daemon=True,
)
_db_writer_thread.start()
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _flush_db_writes(timeout: float = 5.0) -> None:
    """Block until all queued DB writes are processed, or *timeout* elapses.

    For use in tests.

    Fix: the *timeout* parameter was previously accepted but ignored —
    ``Queue.join()`` has no timeout, so a stuck writer thread would hang
    the caller forever. We now wait on the queue's ``all_tasks_done``
    condition with a deadline and return (without raising) on timeout.

    NOTE: ``all_tasks_done`` / ``unfinished_tasks`` are queue.Queue
    internals; they are used here only because join() offers no timeout.
    """
    deadline = time.monotonic() + timeout
    with _db_write_queue.all_tasks_done:
        while _db_write_queue.unfinished_tasks:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return  # timed out — give up rather than hang forever
            _db_write_queue.all_tasks_done.wait(remaining)
|
|
291
|
+
|
|
292
|
+
# ---------------------------------------------------------------------------
|
|
293
|
+
# In-memory session key cache (FIND-02 fix from performance audit)
|
|
294
|
+
# ---------------------------------------------------------------------------
|
|
295
|
+
|
|
296
|
+
# Lazily computed stable session key; None until first get_voice_session_key().
_session_key_cache: str | None = None
# Guards writes to _session_key_cache (double-checked in get_voice_session_key).
_session_key_lock = threading.Lock()
_session_recovery_key: str | None = None  # Set after double-empty to escape poisoned session
|
|
299
|
+
|
|
300
|
+
# ---------------------------------------------------------------------------
|
|
301
|
+
# Conversation state (module-level singletons)
|
|
302
|
+
# ---------------------------------------------------------------------------
|
|
303
|
+
|
|
304
|
+
#: In-process conversation history keyed by session_id.
#: Cleared on conversation reset; also restored from DB on first access.
#: Imported by server.py (see module docstring) — keep the name stable.
conversation_histories: dict = {}

#: Tracks consecutive empty Gateway responses for auto-reset logic.
#: Accessed via this module by server.py — keep the name stable.
_consecutive_empty_responses: int = 0
|
|
310
|
+
|
|
311
|
+
# ---------------------------------------------------------------------------
|
|
312
|
+
# Voice session management
|
|
313
|
+
# (moved here from server.py so the blueprint owns the session counter)
|
|
314
|
+
# ---------------------------------------------------------------------------
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def _save_session_counter(counter: int) -> None:
    """Persist the voice-session reset counter to VOICE_SESSION_FILE."""
    Path(VOICE_SESSION_FILE).write_text(str(counter))
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def get_voice_session_key() -> str:
    """Return the current voice session key.

    Uses a STABLE key (no incrementing counter) so the Z.AI prompt cache
    stays warm across session resets. OpenClaw's daily reset handles context
    clearing — we don't need a new key for that.

    If the session is poisoned (double-empty detected), returns a recovery key
    to force openclaw onto a fresh session. Cleared on first successful response.

    Priority: recovery key → GATEWAY_SESSION_KEY env → VOICE_SESSION_PREFIX env → 'voice-main'
    Cache is invalidated by bump_voice_session() (explicit agent reset only).
    """
    global _session_key_cache
    # Auto-clear recovery keys that have been stuck for >60s.
    _check_recovery_timeout()
    if _session_recovery_key is not None:
        # Poisoned session: route traffic to the temporary recovery key.
        return _session_recovery_key
    if _session_key_cache is not None:
        return _session_key_cache
    with _session_key_lock:
        # Double-checked: another thread may have filled the cache already.
        if _session_key_cache is None:
            # GATEWAY_SESSION_KEY (unique per user) wins; empty/unset falls
            # through to the prefix default.
            _session_key_cache = (
                os.getenv('GATEWAY_SESSION_KEY')
                or os.getenv('VOICE_SESSION_PREFIX', 'voice-main')
            )
        return _session_key_cache
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def bump_voice_session() -> str:
    """Record a session reset and invalidate the cached session key.

    The counter file is incremented purely for logging/tracking how many
    resets have occurred; the actual session key stays stable (e.g. 'main')
    so it matches the heartbeat session and keeps the Z.AI prompt cache
    warm. The in-memory cache is cleared so the next lookup re-reads the
    GATEWAY_SESSION_KEY environment variable.
    """
    global _consecutive_empty_responses, _session_key_cache
    counter = 6  # historical default used when the counter file is absent/corrupt
    try:
        with open(VOICE_SESSION_FILE, 'r') as fh:
            counter = int(fh.read().strip())
    except (FileNotFoundError, ValueError):
        pass
    counter += 1
    _save_session_counter(counter)
    _consecutive_empty_responses = 0
    with _session_key_lock:
        # Invalidate so get_voice_session_key() re-reads the env vars.
        _session_key_cache = None
    stable_key = get_voice_session_key()
    logger.info(f'### SESSION RESET #{counter}: cache invalidated, key stays stable as "{stable_key}"')
    return stable_key
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
# time.time() of the last _enter_session_recovery(); 0 = never entered.
# Drives the 30s re-entry cooldown and the 60s recovery timeout.
_recovery_entered_at: float = 0
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
def _enter_session_recovery():
    """Switch to a temporary recovery session key after double-empty.

    Openclaw will create a fresh session for this key, escaping the
    poisoned state. The recovery key is cleared on the first successful
    (non-empty, non-fallback) response.
    """
    global _session_recovery_key, _recovery_entered_at
    # Cooldown: don't thrash recovery keys from rapid start/stop cycles
    now = time.time()
    if now - _recovery_entered_at < 30:
        logger.info('### SESSION RECOVERY: skipping — cooldown active (entered <30s ago)')
        return
    _recovery_entered_at = now
    # Fix: derive the unique suffix from the epoch time we already have.
    # The old int(datetime.datetime.utcnow().timestamp()) interpreted the
    # naive UTC value as LOCAL time, skewing the suffix by the UTC offset —
    # and utcnow() is deprecated since Python 3.12. The local
    # `import datetime` (which shadowed the module-level `datetime` class)
    # is gone too.
    _session_recovery_key = f'recovery-{int(now)}'
    logger.warning(f'### SESSION RECOVERY: switching to key "{_session_recovery_key}" to escape poisoned session')
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def _exit_session_recovery():
    """Clear the recovery key after a successful response.

    Subsequent requests return to the stable key (cache-warm path)."""
    global _session_recovery_key
    if _session_recovery_key is None:
        return  # not in recovery — nothing to do
    old_recovery, _session_recovery_key = _session_recovery_key, None
    stable = get_voice_session_key()
    logger.info(f'### SESSION RECOVERY CLEARED: "{old_recovery}" → back to stable key "{stable}"')
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
def _check_recovery_timeout():
    """Auto-clear stale recovery keys.

    If recovery has been active for >60s without a successful response,
    the recovery key itself may be stuck — fall back to the stable key."""
    global _session_recovery_key, _recovery_entered_at
    if _session_recovery_key is None:
        return  # not in recovery
    if time.time() - _recovery_entered_at <= 60:
        return  # still within the grace window
    logger.warning(f'### SESSION RECOVERY TIMEOUT: "{_session_recovery_key}" active for >60s — clearing')
    _session_recovery_key = None
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
# ---------------------------------------------------------------------------
|
|
423
|
+
# Helper: notify Brain (non-critical fire-and-forget)
|
|
424
|
+
# ---------------------------------------------------------------------------
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def _notify_brain(event_type: str, **data) -> None:
|
|
428
|
+
"""Append an event to the Brain events file for context tracking."""
|
|
429
|
+
try:
|
|
430
|
+
event = {'type': event_type, 'timestamp': datetime.now().isoformat()}
|
|
431
|
+
event.update(data)
|
|
432
|
+
with open(BRAIN_EVENTS_PATH, 'a') as f:
|
|
433
|
+
f.write(json.dumps(event) + '\n')
|
|
434
|
+
except Exception:
|
|
435
|
+
pass # Non-critical
|
|
436
|
+
|
|
437
|
+
# ---------------------------------------------------------------------------
|
|
438
|
+
# Helper: log conversation to SQLite
|
|
439
|
+
# ---------------------------------------------------------------------------
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
def log_conversation(role: str, message: str, session_id: str = 'default',
                     tts_provider: str = None, voice: str = None) -> None:
    """Queue a single conversation turn for persistence (non-blocking).

    The SQLite write is handed to the background db-writer thread
    (FIND-01 fix), so the request path never waits on the database.
    A best-effort Brain event is emitted as well.
    """
    sql = (
        'INSERT INTO conversation_log '
        '(session_id, role, message, tts_provider, voice, created_at) '
        'VALUES (?, ?, ?, ?, ?, ?)'
    )
    params = (session_id, role, message, tts_provider, voice,
              datetime.now().isoformat())
    _db_write_queue.put((str(DB_PATH), sql, params))
    _notify_brain('conversation', role=role, message=message, session=session_id)
|
|
456
|
+
|
|
457
|
+
# ---------------------------------------------------------------------------
|
|
458
|
+
# Helper: log timing metrics
|
|
459
|
+
# ---------------------------------------------------------------------------
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
def log_metrics(metrics: dict) -> None:
    """Log conversation timing metrics to SQLite + journalctl (non-blocking).

    Write is queued to the background db-writer thread (FIND-01 fix).

    Args:
        metrics: loosely-typed dict of per-request measurements; every field
            is read with .get() so missing keys degrade to None/defaults
            rather than raising.
    """
    # Human-readable one-liner for journalctl/grep before the DB insert.
    logger.info(
        f"[METRICS] profile={metrics.get('profile')} "
        f"handshake={metrics.get('handshake_ms')}ms "
        f"llm={metrics.get('llm_inference_ms')}ms "
        f"tts={metrics.get('tts_generation_ms')}ms "
        f"total={metrics.get('total_ms')}ms "
        f"resp_len={metrics.get('response_len')} "
        f"tts_ok={metrics.get('tts_success', 1)} "
        f"tools={metrics.get('tool_count', 0)} "
        f"fallback={metrics.get('fallback_used', 0)}"
    )
    # Queue the insert; executed asynchronously on the db-writer thread.
    _db_write_queue.put((
        str(DB_PATH),
        '''INSERT INTO conversation_metrics
            (session_id, profile, model, handshake_ms, llm_inference_ms,
             tts_generation_ms, total_ms, user_message_len, response_len,
             tts_text_len, tts_provider, tts_success, tts_error,
             tool_count, fallback_used, error, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
        (
            metrics.get('session_id', 'default'),
            metrics.get('profile', 'unknown'),
            metrics.get('model', 'unknown'),
            metrics.get('handshake_ms'),
            metrics.get('llm_inference_ms'),
            metrics.get('tts_generation_ms'),
            metrics.get('total_ms'),
            metrics.get('user_message_len'),
            metrics.get('response_len'),
            metrics.get('tts_text_len'),
            metrics.get('tts_provider'),
            metrics.get('tts_success', 1),   # default: assume TTS succeeded
            metrics.get('tts_error'),
            metrics.get('tool_count', 0),
            metrics.get('fallback_used', 0),
            metrics.get('error'),
            datetime.now().isoformat(),
        ),
    ))
|
|
506
|
+
|
|
507
|
+
# ---------------------------------------------------------------------------
|
|
508
|
+
# Helper: clean text for TTS
|
|
509
|
+
# ---------------------------------------------------------------------------
|
|
510
|
+
|
|
511
|
+
|
|
512
|
+
def _truncate_at_sentence(text: str, max_chars: int) -> str:
|
|
513
|
+
"""Truncate text at the nearest sentence boundary at or before max_chars.
|
|
514
|
+
Falls back to hard truncation if no boundary is found."""
|
|
515
|
+
if not text or len(text) <= max_chars:
|
|
516
|
+
return text
|
|
517
|
+
chunk = text[:max_chars]
|
|
518
|
+
# Find last sentence-ending punctuation before the cap
|
|
519
|
+
last_boundary = max(chunk.rfind('.'), chunk.rfind('!'), chunk.rfind('?'))
|
|
520
|
+
if last_boundary > 0:
|
|
521
|
+
return chunk[:last_boundary + 1].strip()
|
|
522
|
+
return chunk.strip()
|
|
523
|
+
|
|
524
|
+
|
|
525
|
+
def clean_for_tts(text: str) -> str:
    """Remove markdown, reasoning tokens, and non-speech characters for TTS.

    Pipeline (order matters):
      1. Strip model reasoning sentinels (NO_REPLY / trailing NO|YES), unless
         NO/YES is the entire response.
      2. Remove frontend action tags ([CANVAS:...], [MUSIC_PLAY], ...).
      3. Drop code blocks and inline code.
      4. Add sentence-ending punctuation to headings, list items, bullets,
         table rows and short bare lines so TTS inserts natural pauses
         (must run before markdown is stripped).
      5. Strip markdown formatting, links, URLs and filesystem paths.
      6. Expand acronyms and replace symbols with spoken equivalents.
      7. Collapse whitespace and tidy duplicated punctuation.

    Args:
        text: raw LLM response (may contain markdown and action tags).

    Returns:
        A speakable plain-text string; '' for falsy input.
    """
    if not text:
        return ''

    # Strip GPT-OSS-120B reasoning tokens (but not if NO/YES is the full response)
    if text.strip().upper() not in ['NO', 'YES', 'NO.', 'YES.']:
        text = re.sub(r'^NO_REPLY\s*', '', text)
        text = re.sub(r'\s+NO\s*$', '', text, flags=re.IGNORECASE)
        text = re.sub(r'\s+YES\s*$', '', text, flags=re.IGNORECASE)

    # Remove canvas/task/music triggers (handled by frontend, not spoken)
    text = re.sub(r'\[CANVAS_MENU\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[CANVAS:[^\]]*\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[CANVAS_URL:[^\]]*\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[MUSIC_PLAY(?::[^\]]*)?\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[MUSIC_STOP\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[MUSIC_NEXT\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[SUNO_GENERATE:[^\]]*\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[SLEEP\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[REGISTER_FACE:[^\]]*\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[SPOTIFY:[^\]]*\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[SOUND:[^\]]*\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[SESSION_RESET\]', '', text, flags=re.IGNORECASE)

    # Remove code blocks (complete fences first, then any unclosed fence to end of text)
    text = re.sub(r'```[\s\S]*?```', '', text)
    text = re.sub(r'```[\s\S]*', '', text)
    text = re.sub(r'`[^`]+`', '', text)

    # Add natural pauses for structured content (must happen before stripping markdown)
    text = re.sub(r'^(#+\s+.+?)([^.!?])\s*$', r'\1\2.', text, flags=re.MULTILINE)

    def _ensure_list_item_pause(match):
        # Numbered list item: keep the "1." / "1)" prefix, terminate the body.
        prefix = match.group(1)
        content = match.group(2).strip()
        if content and content[-1] not in '.!?:':
            content += '.'
        return f'{prefix} {content}'
    text = re.sub(r'^(\s*\d+[.)]\s*)(.+?)$', _ensure_list_item_pause,
                  text, flags=re.MULTILINE)

    def _ensure_bullet_pause(match):
        # Bullet item: drop the marker, terminate the body.
        content = match.group(1).strip()
        if content and content[-1] not in '.!?:':
            content += '.'
        return content
    text = re.sub(r'^\s*[-*•]\s+(.+?)$', _ensure_bullet_pause,
                  text, flags=re.MULTILINE)

    def _table_row_to_speech(match):
        # Markdown table row → comma-joined cells; separator rows vanish.
        row = match.group(0)
        if re.match(r'^[\s|:-]+$', row):
            return ''
        cells = [c.strip() for c in row.split('|') if c.strip()]
        if not cells:
            return ''
        return ', '.join(cells) + '.'
    text = re.sub(r'^\|.+\|$', _table_row_to_speech, text, flags=re.MULTILINE)

    # Terminate short bare lines (< 80 chars, alphanumeric start) so each
    # becomes its own spoken sentence after the newline collapse below.
    lines = text.split('\n')
    for i, line in enumerate(lines):
        stripped = line.strip()
        if stripped and len(stripped) < 80 and stripped[-1] not in '.!?:,;':
            if re.match(r'^[A-Za-z0-9]', stripped):
                lines[i] = stripped + '.'
    text = '\n'.join(lines)

    # Strip markdown formatting
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
    text = re.sub(r'\*([^*]+)\*', r'\1', text)
    text = re.sub(r'__([^_]+)__', r'\1', text)
    text = re.sub(r'_([^_]+)_', r'\1', text)
    text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'/[\w/.-]+', '', text)  # filesystem-style paths

    # Expand acronyms to speakable form.
    # NOTE: fixed a duplicated 'API' key that was declared twice in the literal.
    acronyms = {
        'API': 'api', 'HTML': 'html', 'CSS': 'css', 'JSON': 'jason',
        'HTTP': 'http', 'HTTPS': 'https', 'URL': 'url', 'TTS': 'text to speech',
        'STT': 'speech to text', 'LLM': 'large language model', 'AI': 'A.I.',
        'UI': 'user interface', 'UX': 'user experience', 'RAM': 'ram',
        'CPU': 'cpu', 'GPU': 'gpu', 'DB': 'database', 'VPS': 'server',
        'SSH': 'ssh', 'CLI': 'command line', 'SDK': 'sdk',
    }
    for acronym, expansion in acronyms.items():
        text = re.sub(r'\b' + acronym + r'\b', expansion, text)

    # Replace symbols with spoken equivalents
    text = text.replace('&', ' and ')
    text = text.replace('%', ' percent ')
    text = text.replace('$', ' dollars ')
    text = text.replace('@', ' at ')
    text = text.replace('#', ' number ')
    text = text.replace('+', ' plus ')
    text = text.replace('=', ' equals ')

    # Clean up whitespace
    text = re.sub(r'\n+', '. ', text)
    text = re.sub(r'\.{2,}', '.', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'\.\s*\.', '.', text)
    # Strip leading punctuation/spaces (e.g. from [MUSIC_STOP]\n\n → ". text")
    text = re.sub(r'^[.,;:\s]+', '', text)

    return text
|
|
633
|
+
|
|
634
|
+
# ---------------------------------------------------------------------------
|
|
635
|
+
# Helper: legacy Supertonic voice accessor
|
|
636
|
+
# ---------------------------------------------------------------------------
|
|
637
|
+
|
|
638
|
+
|
|
639
|
+
def get_supertonic_for_voice(voice_style: str):
    """Get Supertonic provider (voice_style ignored — unified provider).

    Legacy accessor kept for backward compatibility: older callers picked a
    provider per voice style, but Supertonic is now a single unified provider,
    so *voice_style* is accepted and deliberately ignored.
    """
    # Always the one shared 'supertonic' provider — style no longer matters.
    return get_provider('supertonic')
|
|
642
|
+
|
|
643
|
+
# ---------------------------------------------------------------------------
|
|
644
|
+
# Blueprint
|
|
645
|
+
# ---------------------------------------------------------------------------
|
|
646
|
+
|
|
647
|
+
# Flask blueprint grouping the voice-conversation HTTP endpoints.
conversation_bp = Blueprint('conversation', __name__)
|
|
648
|
+
|
|
649
|
+
# ---------------------------------------------------------------------------
|
|
650
|
+
# POST /api/conversation — main voice conversation endpoint
|
|
651
|
+
# ---------------------------------------------------------------------------
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
@conversation_bp.route('/api/conversation', methods=['POST'])
def conversation():
    """
    Handle voice conversation flow.

    Thin top-level wrapper: delegates to _conversation_inner() and converts
    any unhandled exception into a generic JSON 500 so the client never sees
    a raw traceback.

    Request JSON:
        message       : str  — transcribed user speech (required)
        tts_provider  : str  — 'supertonic' | 'groq' (default: env DEFAULT_TTS_PROVIDER or groq)
        voice         : str  — voice ID, e.g. 'M1' (default: M1)
        session_id    : str  — session identifier (default: default)
        ui_context    : dict — canvas/music state from frontend (optional)

    Response JSON (non-streaming):
        response : str  — AI text response
        audio    : str  — base64-encoded audio (if TTS succeeds)
        timing   : dict — handshake/llm/tts/total ms
        actions  : list — Gateway tool/lifecycle events (optional)
    """
    try:
        return _conversation_inner()
    except Exception:
        # Imported locally: only needed on the (rare) error path.
        # (The unused `as e` binding was removed — the traceback is logged instead.)
        import traceback
        tb = traceback.format_exc()
        logger.error(f'FATAL: {tb}')
        return jsonify({
            'response': 'Something went wrong on my end. Try again?',
            'error': 'Internal server error'
        }), 500
|
|
682
|
+
|
|
683
|
+
|
|
684
|
+
def _conversation_inner():
|
|
685
|
+
global _consecutive_empty_responses
|
|
686
|
+
|
|
687
|
+
t_request_start = time.time()
|
|
688
|
+
metrics = {
|
|
689
|
+
'profile': 'gateway',
|
|
690
|
+
'model': 'glm-4.7-flash',
|
|
691
|
+
'tts_success': 1,
|
|
692
|
+
'fallback_used': 0,
|
|
693
|
+
'tool_count': 0,
|
|
694
|
+
}
|
|
695
|
+
|
|
696
|
+
data = request.get_json()
|
|
697
|
+
if not data:
|
|
698
|
+
logger.error('ERROR: No JSON data in request')
|
|
699
|
+
return jsonify({'error': 'No JSON data provided'}), 400
|
|
700
|
+
|
|
701
|
+
logger.info(f'Received conversation request: {data}')
|
|
702
|
+
|
|
703
|
+
user_message = data.get('message', '').strip()
|
|
704
|
+
tts_provider = data.get('tts_provider') or os.getenv('DEFAULT_TTS_PROVIDER', 'groq')
|
|
705
|
+
voice = data.get('voice', 'M1')
|
|
706
|
+
session_id = data.get('session_id', 'default')
|
|
707
|
+
ui_context = data.get('ui_context', {})
|
|
708
|
+
identified_person = data.get('identified_person') or None
|
|
709
|
+
agent_id = data.get('agent_id') or None # e.g. 'default'; None = default 'main'
|
|
710
|
+
gateway_id = data.get('gateway_id') or None # plugin gateway id; None = 'openclaw'
|
|
711
|
+
max_response_chars = data.get('max_response_chars') or None # profile cap, truncates at sentence boundary
|
|
712
|
+
image_path = data.get('image_path') or None # uploaded image for vision analysis
|
|
713
|
+
metrics['session_id'] = session_id
|
|
714
|
+
metrics['user_message_len'] = len(user_message)
|
|
715
|
+
metrics['tts_provider'] = tts_provider
|
|
716
|
+
|
|
717
|
+
if not user_message:
|
|
718
|
+
return jsonify({'error': 'No message provided'}), 400
|
|
719
|
+
|
|
720
|
+
# Filter garbage STT fragments — punctuation-only, single short words, noise
|
|
721
|
+
import re as _re
|
|
722
|
+
_meaningful_chars = _re.sub(r'[^a-zA-Z0-9]', '', user_message)
|
|
723
|
+
if len(_meaningful_chars) < 3:
|
|
724
|
+
logger.info(f'### FILTERED garbage STT: "{user_message}" ({len(_meaningful_chars)} meaningful chars)')
|
|
725
|
+
# Return a no-op stream that ends cleanly — no fallback message shown
|
|
726
|
+
def _noop_stream():
|
|
727
|
+
yield "data: " + json.dumps({"type": "filtered", "reason": "garbage_stt"}) + "\n\n"
|
|
728
|
+
yield "data: " + json.dumps({"type": "text_done", "response": " "}) + "\n\n"
|
|
729
|
+
return Response(_noop_stream(), mimetype='text/event-stream')
|
|
730
|
+
|
|
731
|
+
# Input length guard (P7-T3 security audit)
|
|
732
|
+
if len(user_message) > 4000:
|
|
733
|
+
return jsonify({'error': 'Message too long (max 4000 characters)'}), 400
|
|
734
|
+
|
|
735
|
+
wants_stream = (
|
|
736
|
+
request.args.get('stream') == '1'
|
|
737
|
+
or request.headers.get('X-Stream-Response') == '1'
|
|
738
|
+
)
|
|
739
|
+
|
|
740
|
+
# Update canvas context from UI state
|
|
741
|
+
if ui_context.get('canvasDisplayed'):
|
|
742
|
+
update_canvas_context(
|
|
743
|
+
ui_context['canvasDisplayed'],
|
|
744
|
+
title=ui_context['canvasDisplayed']
|
|
745
|
+
.replace('/pages/', '')
|
|
746
|
+
.replace('.html', '')
|
|
747
|
+
.replace('-', ' ')
|
|
748
|
+
.title()
|
|
749
|
+
)
|
|
750
|
+
|
|
751
|
+
# Build context prefix from UI state
|
|
752
|
+
t_context_start = time.time()
|
|
753
|
+
context_prefix = ''
|
|
754
|
+
context_parts = []
|
|
755
|
+
|
|
756
|
+
# Inject face recognition identity
|
|
757
|
+
if identified_person and identified_person.get('name') and identified_person.get('name') != 'unknown':
|
|
758
|
+
name = identified_person['name']
|
|
759
|
+
confidence = identified_person.get('confidence', 0)
|
|
760
|
+
context_parts.append(
|
|
761
|
+
f'[FACE RECOGNITION: The person you are speaking with has been identified as {name} '
|
|
762
|
+
f'({confidence}% confidence). Address them by name naturally.]'
|
|
763
|
+
)
|
|
764
|
+
|
|
765
|
+
# Vision: if user asks about what the camera sees, call vision model with latest frame
|
|
766
|
+
if _is_vision_request(user_message):
|
|
767
|
+
from routes.vision import _latest_frame, _call_vision
|
|
768
|
+
_frame_img = _latest_frame.get('image')
|
|
769
|
+
_frame_age = time.time() - _latest_frame.get('ts', 0)
|
|
770
|
+
if _frame_img and _frame_age < _VISION_FRAME_MAX_AGE:
|
|
771
|
+
try:
|
|
772
|
+
_vision_desc = _call_vision(
|
|
773
|
+
_frame_img,
|
|
774
|
+
'Describe what you see in this image concisely. Focus on people, objects, and actions.',
|
|
775
|
+
)
|
|
776
|
+
context_parts.append(f'[CAMERA VISION: {_vision_desc}]')
|
|
777
|
+
except Exception as exc:
|
|
778
|
+
logger.warning('Vision analysis failed: %s', exc)
|
|
779
|
+
context_parts.append('[CAMERA VISION: Camera is on but vision analysis failed.]')
|
|
780
|
+
elif not _frame_img:
|
|
781
|
+
context_parts.append('[CAMERA VISION: No camera frame available — camera may be off.]')
|
|
782
|
+
else:
|
|
783
|
+
context_parts.append('[CAMERA VISION: Camera frame is stale — camera may have been turned off.]')
|
|
784
|
+
|
|
785
|
+
# Vision: if user uploaded an image, analyze it with vision model
|
|
786
|
+
if image_path:
|
|
787
|
+
try:
|
|
788
|
+
_img_file = Path(image_path).resolve()
|
|
789
|
+
# Security: only allow files inside uploads/ directories
|
|
790
|
+
if 'uploads' not in _img_file.parts:
|
|
791
|
+
raise ValueError(f'Path traversal blocked: {image_path}')
|
|
792
|
+
if _img_file.is_file() and _img_file.stat().st_size < 20_000_000: # 20MB safety cap
|
|
793
|
+
from routes.vision import _call_vision
|
|
794
|
+
_img_b64 = base64.b64encode(_img_file.read_bytes()).decode('ascii')
|
|
795
|
+
_upload_desc = _call_vision(
|
|
796
|
+
_img_b64,
|
|
797
|
+
'Describe what you see in this image in detail. Include colors, objects, text, people, layout, and any notable features.',
|
|
798
|
+
)
|
|
799
|
+
context_parts.append(f'[UPLOADED IMAGE ANALYSIS: {_upload_desc}]')
|
|
800
|
+
logger.info('Vision analysis of uploaded image succeeded (%d bytes)', _img_file.stat().st_size)
|
|
801
|
+
else:
|
|
802
|
+
logger.warning('Uploaded image not found or too large: %s', image_path)
|
|
803
|
+
context_parts.append('[UPLOADED IMAGE: File could not be analyzed — may be too large or missing.]')
|
|
804
|
+
except Exception as exc:
|
|
805
|
+
logger.warning('Vision analysis of uploaded image failed: %s', exc)
|
|
806
|
+
context_parts.append('[UPLOADED IMAGE: Vision analysis failed — the image was uploaded but could not be analyzed.]')
|
|
807
|
+
|
|
808
|
+
if ui_context:
|
|
809
|
+
# Canvas state
|
|
810
|
+
if ui_context.get('canvasVisible') and ui_context.get('canvasDisplayed'):
|
|
811
|
+
page_name = (ui_context['canvasDisplayed']
|
|
812
|
+
.replace('/pages/', '')
|
|
813
|
+
.replace('.html', '')
|
|
814
|
+
.replace('-', ' '))
|
|
815
|
+
context_parts.append(f'[Canvas OPEN: {page_name}]')
|
|
816
|
+
elif not ui_context.get('canvasVisible'):
|
|
817
|
+
context_parts.append('[Canvas CLOSED]')
|
|
818
|
+
if ui_context.get('canvasMenuOpen'):
|
|
819
|
+
context_parts.append('[Canvas menu visible to user]')
|
|
820
|
+
# Canvas JS errors — auto-injected from browser error buffer
|
|
821
|
+
canvas_errors = ui_context.get('canvasErrors', [])
|
|
822
|
+
if canvas_errors:
|
|
823
|
+
err_str = ' | '.join(canvas_errors)
|
|
824
|
+
context_parts.append(f'[Canvas JS Errors: {err_str}]')
|
|
825
|
+
|
|
826
|
+
# Music state (server-side is authoritative)
|
|
827
|
+
_srv_track = _music_state.get('current_track')
|
|
828
|
+
_srv_playing = _music_state.get('playing', False)
|
|
829
|
+
if _srv_playing and _srv_track:
|
|
830
|
+
_track_name = _srv_track.get('title') or _srv_track.get('name', 'unknown')
|
|
831
|
+
context_parts.append(f'[Music PLAYING: {_track_name}]')
|
|
832
|
+
elif _srv_track:
|
|
833
|
+
_track_name = _srv_track.get('title') or _srv_track.get('name', 'unknown')
|
|
834
|
+
context_parts.append(f'[Music PAUSED/STOPPED — last track: {_track_name}]')
|
|
835
|
+
elif ui_context.get('musicPlaying'):
|
|
836
|
+
track = ui_context.get('musicTrack', 'unknown')
|
|
837
|
+
context_parts.append(f'[Music PLAYING: {track}]')
|
|
838
|
+
|
|
839
|
+
# Available music tracks (so agent can use [MUSIC_PLAY:exact name])
|
|
840
|
+
try:
|
|
841
|
+
from routes.music import get_music_files
|
|
842
|
+
_lib_tracks = get_music_files('library')
|
|
843
|
+
_gen_tracks = get_music_files('generated')
|
|
844
|
+
_lib_names = [t.get('title') or t.get('name', '') for t in _lib_tracks]
|
|
845
|
+
_gen_names = [t.get('title') or t.get('name', '') for t in _gen_tracks]
|
|
846
|
+
_lib_names = [n for n in _lib_names if n]
|
|
847
|
+
_gen_names = [n for n in _gen_names if n]
|
|
848
|
+
_parts = []
|
|
849
|
+
if _lib_names:
|
|
850
|
+
_parts.append(f'Library ({len(_lib_names)}): {_cap_list(_lib_names, max_chars=2000)}')
|
|
851
|
+
if _gen_names:
|
|
852
|
+
_parts.append(f'Generated ({len(_gen_names)}): {_cap_list(_gen_names, max_chars=2000)}')
|
|
853
|
+
if _parts:
|
|
854
|
+
context_parts.append(f'[Available tracks — {" | ".join(_parts)}]')
|
|
855
|
+
except Exception:
|
|
856
|
+
pass
|
|
857
|
+
|
|
858
|
+
# Recently completed Suno generations — agent gets notified on next turn
|
|
859
|
+
try:
|
|
860
|
+
from routes.suno import completed_songs_queue
|
|
861
|
+
if completed_songs_queue:
|
|
862
|
+
_pending = completed_songs_queue[-3:]
|
|
863
|
+
_titles = [s.get('title', 'Unknown Track') for s in _pending]
|
|
864
|
+
context_parts.append(f'[Suno just finished: {", ".join(repr(t) for t in _titles)} — now ready in Generated playlist]')
|
|
865
|
+
except Exception:
|
|
866
|
+
pass
|
|
867
|
+
|
|
868
|
+
# Available canvas pages (agent needs IDs for [CANVAS:page-id])
|
|
869
|
+
try:
|
|
870
|
+
from routes.canvas import load_canvas_manifest
|
|
871
|
+
_manifest = load_canvas_manifest()
|
|
872
|
+
_page_ids = sorted(_manifest.get('pages', {}).keys())
|
|
873
|
+
_page_list = _cap_list(_page_ids, max_chars=1000)
|
|
874
|
+
except Exception:
|
|
875
|
+
_page_list = 'unknown'
|
|
876
|
+
context_parts.append(f'[Canvas pages: {_page_list}]')
|
|
877
|
+
|
|
878
|
+
# Available DJ sounds (for [SOUND:name] in DJ mode)
|
|
879
|
+
context_parts.append(
|
|
880
|
+
'[DJ sounds: air_horn, scratch_long, rewind, record_stop, '
|
|
881
|
+
'crowd_cheer, crowd_hype, yeah, lets_go, gunshot, bruh, sad_trombone]'
|
|
882
|
+
)
|
|
883
|
+
# Inject active profile's custom system_prompt (admin editor → runtime)
|
|
884
|
+
# Also read min_sentence_chars for TTS sentence extraction.
|
|
885
|
+
_min_sentence_chars = 40 # default — prevents choppy short TTS fragments
|
|
886
|
+
try:
|
|
887
|
+
from profiles.manager import get_profile_manager
|
|
888
|
+
from routes.profiles import _active_profile_id
|
|
889
|
+
_mgr = get_profile_manager()
|
|
890
|
+
_prof = _mgr.get_profile(_active_profile_id)
|
|
891
|
+
if _prof and _prof.system_prompt and _prof.system_prompt.strip():
|
|
892
|
+
context_parts.append(f'[PROFILE INSTRUCTIONS: {_prof.system_prompt.strip()}]')
|
|
893
|
+
if _prof and hasattr(_prof, 'voice') and _prof.voice and _prof.voice.min_sentence_chars:
|
|
894
|
+
_min_sentence_chars = _prof.voice.min_sentence_chars
|
|
895
|
+
except Exception:
|
|
896
|
+
pass # Profile system not available — skip gracefully
|
|
897
|
+
|
|
898
|
+
# Inject voice assistant instructions so the agent knows about action tags.
|
|
899
|
+
# This must be in-app (not workspace files) so it works out of the box.
|
|
900
|
+
context_parts.append(_load_voice_system_prompt())
|
|
901
|
+
|
|
902
|
+
if context_parts:
|
|
903
|
+
context_prefix = ' '.join(context_parts) + ' '
|
|
904
|
+
|
|
905
|
+
t_context_ms = int((time.time() - t_context_start) * 1000)
|
|
906
|
+
if t_context_ms > 50:
|
|
907
|
+
logger.info(f"### CONTEXT BUILD TIMING: {t_context_ms}ms ({len(context_parts)} parts, {len(context_prefix)} chars)")
|
|
908
|
+
|
|
909
|
+
log_conversation('user', user_message, session_id=session_id,
|
|
910
|
+
tts_provider=tts_provider, voice=voice)
|
|
911
|
+
|
|
912
|
+
# Replace the legacy __session_start__ sentinel with a natural-language greeting
|
|
913
|
+
# prompt so the LLM produces a real greeting instead of a system sentinel ("NO").
|
|
914
|
+
# user_message is kept as-is so the sentinel suppression logic still works.
|
|
915
|
+
if user_message == '__session_start__':
|
|
916
|
+
_face = identified_person or {}
|
|
917
|
+
_face_name = _face.get('name', '') if _face.get('name', '') != 'unknown' else ''
|
|
918
|
+
if _face_name:
|
|
919
|
+
_gateway_message = (
|
|
920
|
+
f'A new voice session has just started. The person in front of the camera '
|
|
921
|
+
f'has been identified as {_face_name}. Greet them by name — '
|
|
922
|
+
f'one brief, friendly sentence.'
|
|
923
|
+
)
|
|
924
|
+
else:
|
|
925
|
+
_gateway_message = (
|
|
926
|
+
'A new voice session has just started. Give a brief, friendly one-sentence greeting. '
|
|
927
|
+
'Do NOT address anyone by name — no face has been recognized and you do not know who is speaking.'
|
|
928
|
+
)
|
|
929
|
+
elif user_message.startswith('__suno_complete__:'):
|
|
930
|
+
_song_title = user_message[len('__suno_complete__:'):].strip() or 'your track'
|
|
931
|
+
_gateway_message = (
|
|
932
|
+
f'The Suno song "{_song_title}" just finished generating and is now ready in the music player. '
|
|
933
|
+
f'Let the user know in one brief, friendly sentence and offer to play it for them.'
|
|
934
|
+
)
|
|
935
|
+
else:
|
|
936
|
+
_gateway_message = user_message
|
|
937
|
+
message_with_context = context_prefix + _gateway_message if context_prefix else _gateway_message
|
|
938
|
+
ai_response = None
|
|
939
|
+
captured_actions = []
|
|
940
|
+
|
|
941
|
+
# ── PRIMARY PATH: Gateway (routed by gateway_id from request/profile) ──
|
|
942
|
+
if gateway_manager.is_configured():
|
|
943
|
+
try:
|
|
944
|
+
logger.info('### Starting Gateway connection...')
|
|
945
|
+
event_queue: queue.Queue = queue.Queue()
|
|
946
|
+
_session_key = get_voice_session_key()
|
|
947
|
+
|
|
948
|
+
# Check if gateway recently reconnected after a failure —
|
|
949
|
+
# inject a system note so the agent acknowledges the interruption
|
|
950
|
+
_recovery_prefix = ''
|
|
951
|
+
try:
|
|
952
|
+
_gw = gateway_manager.get(gateway_id)
|
|
953
|
+
if _gw and hasattr(_gw, 'consume_reconnection') and _gw.consume_reconnection():
|
|
954
|
+
_recovery_prefix = (
|
|
955
|
+
'[SYSTEM: The connection was briefly interrupted (server restart). '
|
|
956
|
+
'Briefly acknowledge this to the user before responding to their message.]\n\n'
|
|
957
|
+
)
|
|
958
|
+
logger.info('### Injecting recovery prefix into message')
|
|
959
|
+
except Exception:
|
|
960
|
+
pass
|
|
961
|
+
|
|
962
|
+
def _run_gateway():
|
|
963
|
+
_msg = _recovery_prefix + message_with_context if _recovery_prefix else message_with_context
|
|
964
|
+
gateway_manager.stream_to_queue(
|
|
965
|
+
event_queue, _msg, _session_key, captured_actions,
|
|
966
|
+
gateway_id=gateway_id,
|
|
967
|
+
agent_id=agent_id,
|
|
968
|
+
)
|
|
969
|
+
|
|
970
|
+
t_llm_start = time.time()
|
|
971
|
+
gw_thread = threading.Thread(target=_run_gateway, daemon=True)
|
|
972
|
+
gw_thread.start()
|
|
973
|
+
|
|
974
|
+
if wants_stream:
|
|
975
|
+
# ── STREAMING MODE ────────────────────────────────────────
|
|
976
|
+
def stream_response():
|
|
977
|
+
nonlocal ai_response, event_queue, t_llm_start
|
|
978
|
+
|
|
979
|
+
# ── TTS helpers ───────────────────────────────────────
|
|
980
|
+
try:
|
|
981
|
+
_prov = get_provider(tts_provider)
|
|
982
|
+
_audio_fmt = _prov.get_info().get('audio_format', 'wav')
|
|
983
|
+
except Exception:
|
|
984
|
+
_audio_fmt = 'wav'
|
|
985
|
+
|
|
986
|
+
def _tts_error_event(err_str):
|
|
987
|
+
code_match = re.search(r'\[groq:([^\]]+)\]', err_str)
|
|
988
|
+
err_code = code_match.group(1) if code_match else 'unknown'
|
|
989
|
+
REASONS = {
|
|
990
|
+
'model_terms_required': ('terms', 'Accept Orpheus terms at console.groq.com'),
|
|
991
|
+
'rate_limit_exceeded': ('rate_limit', 'Groq rate limit hit — try again shortly'),
|
|
992
|
+
'insufficient_quota': ('no_credits', 'Groq account out of credits'),
|
|
993
|
+
'invalid_api_key': ('bad_key', 'Invalid GROQ_API_KEY'),
|
|
994
|
+
'unknown': ('error', err_str),
|
|
995
|
+
}
|
|
996
|
+
reason_key, reason_msg = REASONS.get(err_code, ('error', err_str))
|
|
997
|
+
return json.dumps({
|
|
998
|
+
'type': 'tts_error',
|
|
999
|
+
'provider': tts_provider,
|
|
1000
|
+
'reason': reason_key,
|
|
1001
|
+
'error': reason_msg,
|
|
1002
|
+
}) + '\n'
|
|
1003
|
+
|
|
1004
|
+
# ── Mid-stream TTS helpers ────────────────────────────
|
|
1005
|
+
def _has_open_tag(text):
|
|
1006
|
+
"""True while inside an incomplete [...] action tag or open code fence."""
|
|
1007
|
+
if text.count('[') > text.count(']'):
|
|
1008
|
+
return True
|
|
1009
|
+
# Odd number of ``` markers means we're inside a code block
|
|
1010
|
+
if text.count('```') % 2 != 0:
|
|
1011
|
+
return True
|
|
1012
|
+
return False
|
|
1013
|
+
|
|
1014
|
+
def _extract_sentence(text, min_len=40):
|
|
1015
|
+
"""Return (sentence, remainder) at first sentence boundary
|
|
1016
|
+
that falls at or after min_len chars. Skips boundaries that
|
|
1017
|
+
are likely inside abbreviations (e.g. A.I., Mr.)."""
|
|
1018
|
+
if len(text) < min_len:
|
|
1019
|
+
return None, text
|
|
1020
|
+
for match in re.finditer(r'[.!?](?= |\Z)', text):
|
|
1021
|
+
end = match.end()
|
|
1022
|
+
if end >= min_len:
|
|
1023
|
+
return text[:end].strip(), text[end:].lstrip()
|
|
1024
|
+
return None, text
|
|
1025
|
+
|
|
1026
|
+
def _fire_tts(raw_text):
    """Start TTS for raw_text in background. Returns (done_event, result)."""
    done = threading.Event()
    result = {'audio': None, 'error': None}

    def _worker():
        # Runs on a daemon thread; result/done are shared with the caller.
        try:
            started = time.time()
            cleaned = clean_for_tts(raw_text)
            cleaned_at = time.time()
            if cleaned and cleaned.strip():
                result['audio'] = _tts_generate_b64(
                    cleaned,
                    voice=voice or 'M1',
                    tts_provider=tts_provider,
                )
            finished = time.time()
            logger.info(
                f"### TTS TIMING: clean={int((cleaned_at - started) * 1000)}ms "
                f"generate={int((finished - cleaned_at) * 1000)}ms "
                f"total={int((finished - started) * 1000)}ms "
                f"text={len(cleaned or '')} chars"
            )
        except Exception as exc:
            result['error'] = str(exc)
        finally:
            # Always signal completion so waiters never hang.
            done.set()

    threading.Thread(target=_worker, daemon=True).start()
    return done, result
|
|
1053
|
+
|
|
1054
|
+
# Mid-stream TTS state
|
|
1055
|
+
_tts_buf = '' # raw incremental text buffer
|
|
1056
|
+
_tts_pending = [] # [(done_event, result_dict), ...]
|
|
1057
|
+
_chunks_sent = 0 # audio chunks already yielded early
|
|
1058
|
+
|
|
1059
|
+
full_response = None
|
|
1060
|
+
_stream_start = time.time()
|
|
1061
|
+
_STREAM_HARD_TIMEOUT = 310 # seconds — total allowed time
|
|
1062
|
+
_QUEUE_POLL_INTERVAL = 10 # seconds — yield heartbeat if no events
|
|
1063
|
+
while True:
|
|
1064
|
+
try:
|
|
1065
|
+
evt = event_queue.get(timeout=_QUEUE_POLL_INTERVAL)
|
|
1066
|
+
except queue.Empty:
|
|
1067
|
+
# No events for _QUEUE_POLL_INTERVAL seconds.
|
|
1068
|
+
# Yield a heartbeat to keep the browser/Cloudflare
|
|
1069
|
+
# connection alive (they time out at 60-100s of silence).
|
|
1070
|
+
elapsed = int(time.time() - _stream_start)
|
|
1071
|
+
if elapsed > _STREAM_HARD_TIMEOUT:
|
|
1072
|
+
yield json.dumps({'type': 'error', 'error': 'Gateway timeout'}) + '\n'
|
|
1073
|
+
break
|
|
1074
|
+
yield json.dumps({'type': 'heartbeat', 'elapsed': elapsed}) + '\n'
|
|
1075
|
+
continue
|
|
1076
|
+
|
|
1077
|
+
if evt['type'] == 'handshake':
|
|
1078
|
+
metrics['handshake_ms'] = evt['ms']
|
|
1079
|
+
continue
|
|
1080
|
+
|
|
1081
|
+
if evt['type'] == 'heartbeat':
|
|
1082
|
+
logger.info(f"### HEARTBEAT → browser ({evt.get('elapsed', 0)}s)")
|
|
1083
|
+
yield json.dumps({'type': 'heartbeat', 'elapsed': evt.get('elapsed', 0)}) + '\n'
|
|
1084
|
+
# Flush any TTS that finished during tool execution —
|
|
1085
|
+
# without this, audio sits in _tts_pending for the
|
|
1086
|
+
# entire duration of tool calls (30-60s+ silence).
|
|
1087
|
+
while _tts_pending and _tts_pending[0][0].is_set():
|
|
1088
|
+
_done_evt, _res = _tts_pending.pop(0)
|
|
1089
|
+
if _res.get('error'):
|
|
1090
|
+
yield _tts_error_event(_res['error'])
|
|
1091
|
+
elif _res.get('audio'):
|
|
1092
|
+
yield json.dumps({
|
|
1093
|
+
'type': 'audio',
|
|
1094
|
+
'audio': _res['audio'],
|
|
1095
|
+
'audio_format': _audio_fmt,
|
|
1096
|
+
'chunk': _chunks_sent,
|
|
1097
|
+
'total_chunks': None,
|
|
1098
|
+
'timing': {
|
|
1099
|
+
'tts_ms': 0,
|
|
1100
|
+
'total_ms': int((time.time() - t_request_start) * 1000),
|
|
1101
|
+
},
|
|
1102
|
+
}) + '\n'
|
|
1103
|
+
_chunks_sent += 1
|
|
1104
|
+
continue
|
|
1105
|
+
|
|
1106
|
+
if evt['type'] == 'delta':
|
|
1107
|
+
_tts_buf += evt['text']
|
|
1108
|
+
# Don't fire TTS if buffer looks like a system response
|
|
1109
|
+
# that will be suppressed at text_done. Wait for final
|
|
1110
|
+
# confirmation before speaking.
|
|
1111
|
+
_buf_stripped = _tts_buf.strip()
|
|
1112
|
+
# Suppress system responses — uses regex from compat layer
|
|
1113
|
+
# plus partial match for mid-stream detection
|
|
1114
|
+
_is_system_text = (
|
|
1115
|
+
is_system_response(_buf_stripped)
|
|
1116
|
+
or _buf_stripped.upper().startswith('HEARTBEAT')
|
|
1117
|
+
)
|
|
1118
|
+
# Fire TTS for complete sentences as they arrive
|
|
1119
|
+
if not _is_system_text and not _has_open_tag(_tts_buf):
|
|
1120
|
+
sentence, _tts_buf = _extract_sentence(_tts_buf, min_len=_min_sentence_chars)
|
|
1121
|
+
if sentence:
|
|
1122
|
+
logger.info(f"### TTS sentence (streaming): {sentence[:80]}")
|
|
1123
|
+
_tts_pending.append(_fire_tts(sentence))
|
|
1124
|
+
yield json.dumps({'type': 'delta', 'text': evt['text']}) + '\n'
|
|
1125
|
+
# Flush any TTS chunks that finished while text was streaming —
|
|
1126
|
+
# play audio as soon as it's ready instead of waiting for text_done
|
|
1127
|
+
while _tts_pending and _tts_pending[0][0].is_set():
|
|
1128
|
+
_done_evt, _res = _tts_pending.pop(0)
|
|
1129
|
+
if _res.get('error'):
|
|
1130
|
+
yield _tts_error_event(_res['error'])
|
|
1131
|
+
elif _res.get('audio'):
|
|
1132
|
+
yield json.dumps({
|
|
1133
|
+
'type': 'audio',
|
|
1134
|
+
'audio': _res['audio'],
|
|
1135
|
+
'audio_format': _audio_fmt,
|
|
1136
|
+
'chunk': _chunks_sent,
|
|
1137
|
+
'total_chunks': None,
|
|
1138
|
+
'timing': {
|
|
1139
|
+
'tts_ms': 0,
|
|
1140
|
+
'total_ms': int((time.time() - t_request_start) * 1000),
|
|
1141
|
+
},
|
|
1142
|
+
}) + '\n'
|
|
1143
|
+
_chunks_sent += 1
|
|
1144
|
+
continue
|
|
1145
|
+
|
|
1146
|
+
if evt['type'] == 'action':
|
|
1147
|
+
# Flush any TTS chunks that already finished —
|
|
1148
|
+
# avoids silence during long tool calls (the first
|
|
1149
|
+
# sentence TTS completes ~1s in but would otherwise
|
|
1150
|
+
# wait until text_done which can be minutes away).
|
|
1151
|
+
while _tts_pending and _tts_pending[0][0].is_set():
|
|
1152
|
+
_done_evt, _res = _tts_pending.pop(0)
|
|
1153
|
+
if _res.get('error'):
|
|
1154
|
+
yield _tts_error_event(_res['error'])
|
|
1155
|
+
elif _res.get('audio'):
|
|
1156
|
+
yield json.dumps({
|
|
1157
|
+
'type': 'audio',
|
|
1158
|
+
'audio': _res['audio'],
|
|
1159
|
+
'audio_format': _audio_fmt,
|
|
1160
|
+
'chunk': _chunks_sent,
|
|
1161
|
+
'total_chunks': None,
|
|
1162
|
+
'timing': {
|
|
1163
|
+
'tts_ms': 0,
|
|
1164
|
+
'total_ms': int((time.time() - t_request_start) * 1000),
|
|
1165
|
+
},
|
|
1166
|
+
}) + '\n'
|
|
1167
|
+
_chunks_sent += 1
|
|
1168
|
+
yield json.dumps({'type': 'action', 'action': evt['action']}) + '\n'
|
|
1169
|
+
continue
|
|
1170
|
+
|
|
1171
|
+
if evt['type'] == 'queued':
|
|
1172
|
+
StatusModule_hack = True # just yield to browser
|
|
1173
|
+
yield json.dumps({'type': 'queued'}) + '\n'
|
|
1174
|
+
continue
|
|
1175
|
+
|
|
1176
|
+
if evt['type'] == 'text_done':
|
|
1177
|
+
logger.info(f"### TEXT_DONE received. response={len(evt.get('response', '') or '')} chars, _tts_pending={len(_tts_pending)}, _tts_buf={repr(_tts_buf[:80])}")
|
|
1178
|
+
# Handle LLM/gateway errors with a spoken fallback
|
|
1179
|
+
if evt.get('error') and not evt.get('response'):
|
|
1180
|
+
error_msg = evt['error']
|
|
1181
|
+
logger.error(f"### GATEWAY ERROR → fallback: {error_msg}")
|
|
1182
|
+
evt['response'] = "One moment, still working on that."
|
|
1183
|
+
metrics['fallback_used'] = 1
|
|
1184
|
+
full_response = evt.get('response')
|
|
1185
|
+
if full_response and max_response_chars:
|
|
1186
|
+
full_response = _truncate_at_sentence(full_response, max_response_chars)
|
|
1187
|
+
|
|
1188
|
+
# Suppress bare NO/YES sentinel responses to system triggers
|
|
1189
|
+
# (gateway returns "NO" for wake-word checks on __session_start__)
|
|
1190
|
+
_is_system_trigger = user_message.startswith('__')
|
|
1191
|
+
if _is_system_trigger and full_response and \
|
|
1192
|
+
full_response.strip().upper() in ('NO', 'NO.', 'YES', 'YES.'):
|
|
1193
|
+
logger.info(f'Suppressing sentinel "{full_response.strip()}" for system trigger')
|
|
1194
|
+
yield json.dumps({'type': 'no_audio'}) + '\n'
|
|
1195
|
+
log_metrics(metrics)
|
|
1196
|
+
break
|
|
1197
|
+
|
|
1198
|
+
# Tag-only response fallback: if the agent responded
|
|
1199
|
+
# with ONLY action tags and no spoken words, prepend
|
|
1200
|
+
# a brief acknowledgment so TTS has something to say.
|
|
1201
|
+
if full_response and re.match(
|
|
1202
|
+
r'^\s*(\[[^\]]+\]\s*)+$', full_response
|
|
1203
|
+
):
|
|
1204
|
+
logger.info(
|
|
1205
|
+
f"### Tag-only response detected, prepending "
|
|
1206
|
+
f"spoken text: {full_response.strip()[:60]}"
|
|
1207
|
+
)
|
|
1208
|
+
full_response = "Here you go. " + full_response
|
|
1209
|
+
|
|
1210
|
+
metrics['llm_inference_ms'] = int((time.time() - t_llm_start) * 1000)
|
|
1211
|
+
metrics['tool_count'] = sum(
|
|
1212
|
+
1 for a in captured_actions
|
|
1213
|
+
if a.get('type') == 'tool' and a.get('phase') == 'start'
|
|
1214
|
+
)
|
|
1215
|
+
metrics['profile'] = 'gateway'
|
|
1216
|
+
metrics['model'] = 'glm-4.7-flash'
|
|
1217
|
+
logger.debug(f"[GW] Gateway response ({len(full_response or '')} chars): {repr((full_response or '')[:300])}")
|
|
1218
|
+
logger.info(
|
|
1219
|
+
f"### LLM inference completed in "
|
|
1220
|
+
f"{metrics['llm_inference_ms']}ms "
|
|
1221
|
+
f"(tools={metrics['tool_count']})"
|
|
1222
|
+
)
|
|
1223
|
+
|
|
1224
|
+
# ── Clear recovery mode on successful gateway response ──
|
|
1225
|
+
if full_response and full_response.strip() and _session_recovery_key is not None:
|
|
1226
|
+
_exit_session_recovery()
|
|
1227
|
+
|
|
1228
|
+
# ── Retry once on instant empty response ──
|
|
1229
|
+
# IMPORTANT: check BEFORE yielding text_done.
|
|
1230
|
+
# If we yield empty text_done first, the client
|
|
1231
|
+
# shows "Sorry" and cancels its reader — the retry
|
|
1232
|
+
# result never reaches it.
|
|
1233
|
+
# Instead: yield {'type':'retrying'} to keep the
|
|
1234
|
+
# client alive, then swap the event queue.
|
|
1235
|
+
_is_empty = not full_response or not full_response.strip()
|
|
1236
|
+
if _is_empty and metrics.get('llm_inference_ms', 9999) < 5000 \
|
|
1237
|
+
and not getattr(stream_response, '_retried', False):
|
|
1238
|
+
stream_response._retried = True
|
|
1239
|
+
logger.warning(
|
|
1240
|
+
f"### EMPTY RESPONSE in {metrics['llm_inference_ms']}ms "
|
|
1241
|
+
f"— retrying once (client kept alive via 'retrying' event)"
|
|
1242
|
+
)
|
|
1243
|
+
# Tell the client to wait — don't show fallback
|
|
1244
|
+
yield json.dumps({'type': 'retrying'}) + '\n'
|
|
1245
|
+
time.sleep(2)
|
|
1246
|
+
# Re-send the same message through the gateway on the same key.
|
|
1247
|
+
# Openclaw removed the orphaned message on the first attempt.
|
|
1248
|
+
# If this is session_start, also clear the session file to eliminate
|
|
1249
|
+
# any further stale state before the retry.
|
|
1250
|
+
# Note: session file clearing moved to host watchdog
|
|
1251
|
+
# (session files are inside openclaw container, not accessible from here)
|
|
1252
|
+
retry_queue = queue.Queue()
|
|
1253
|
+
captured_actions.clear()
|
|
1254
|
+
def _retry_gateway():
|
|
1255
|
+
gateway_manager.stream_to_queue(
|
|
1256
|
+
retry_queue, message_with_context,
|
|
1257
|
+
_session_key, captured_actions,
|
|
1258
|
+
gateway_id=gateway_id,
|
|
1259
|
+
agent_id=agent_id,
|
|
1260
|
+
)
|
|
1261
|
+
retry_thread = threading.Thread(
|
|
1262
|
+
target=_retry_gateway, daemon=True
|
|
1263
|
+
)
|
|
1264
|
+
t_llm_start = time.time()
|
|
1265
|
+
retry_thread.start()
|
|
1266
|
+
event_queue = retry_queue
|
|
1267
|
+
logger.info("### RETRY: re-sent message to gateway")
|
|
1268
|
+
continue # back to event loop — text_done NOT sent yet
|
|
1269
|
+
|
|
1270
|
+
# ── Z.AI direct fallback after double-empty ──
|
|
1271
|
+
if _is_empty and getattr(stream_response, '_retried', False):
|
|
1272
|
+
logger.warning('### DOUBLE EMPTY — session poisoned, entering recovery mode')
|
|
1273
|
+
|
|
1274
|
+
# 1. Switch to recovery session key so NEXT request
|
|
1275
|
+
# goes to a fresh openclaw session (not the poisoned one)
|
|
1276
|
+
_enter_session_recovery()
|
|
1277
|
+
|
|
1278
|
+
# 2. Force-disconnect gateway WS so it reconnects fresh
|
|
1279
|
+
try:
|
|
1280
|
+
_gw = gateway_manager.get(gateway_id)
|
|
1281
|
+
if _gw and hasattr(_gw, 'force_disconnect'):
|
|
1282
|
+
_gw.force_disconnect()
|
|
1283
|
+
logger.warning('### Force-disconnected gateway WS after double-empty')
|
|
1284
|
+
except Exception as _dfe:
|
|
1285
|
+
logger.error(f'### Failed to disconnect gateway: {_dfe}')
|
|
1286
|
+
|
|
1287
|
+
# 3. Write restart flag for host watchdog (background cleanup)
|
|
1288
|
+
try:
|
|
1289
|
+
_flag_path = Path('/app/runtime/uploads/.restart-openclaw.flag')
|
|
1290
|
+
_flag_path.write_text(
|
|
1291
|
+
f'double-empty at {__import__("datetime").datetime.utcnow().isoformat()}Z'
|
|
1292
|
+
)
|
|
1293
|
+
logger.warning('### Wrote .restart-openclaw.flag — watchdog will clean up poisoned session')
|
|
1294
|
+
except Exception as _rfe:
|
|
1295
|
+
logger.error(f'### Failed to write restart flag: {_rfe}')
|
|
1296
|
+
|
|
1297
|
+
# 4. Try Z.AI direct fallback for THIS message
|
|
1298
|
+
try:
|
|
1299
|
+
import requests as _req
|
|
1300
|
+
_zai_key = os.environ.get('ZAI_API_KEY', '')
|
|
1301
|
+
if _zai_key:
|
|
1302
|
+
_zai_resp = _req.post(
|
|
1303
|
+
'https://api.z.ai/api/anthropic/v1/messages',
|
|
1304
|
+
headers={
|
|
1305
|
+
'x-api-key': _zai_key,
|
|
1306
|
+
'anthropic-version': '2023-06-01',
|
|
1307
|
+
'content-type': 'application/json',
|
|
1308
|
+
},
|
|
1309
|
+
json={
|
|
1310
|
+
'model': 'glm-4.7',
|
|
1311
|
+
'max_tokens': 400,
|
|
1312
|
+
'messages': [{'role': 'user', 'content': message_with_context}],
|
|
1313
|
+
},
|
|
1314
|
+
timeout=20,
|
|
1315
|
+
)
|
|
1316
|
+
if _zai_resp.status_code == 200:
|
|
1317
|
+
_zai_data = _zai_resp.json()
|
|
1318
|
+
_zai_text = _zai_data.get('content', [{}])[0].get('text', '')
|
|
1319
|
+
if _zai_text:
|
|
1320
|
+
full_response = _zai_text
|
|
1321
|
+
metrics['fallback_used'] = 1
|
|
1322
|
+
metrics['profile'] = 'zai-direct'
|
|
1323
|
+
logger.info(f'### Z.AI direct fallback succeeded: {len(_zai_text)} chars')
|
|
1324
|
+
except Exception as _zfe:
|
|
1325
|
+
logger.error(f'### Z.AI direct fallback failed: {_zfe}')
|
|
1326
|
+
|
|
1327
|
+
if not full_response or not full_response.strip():
|
|
1328
|
+
full_response = "I had a brief connection issue. I'm reconnecting now — please try again."
|
|
1329
|
+
|
|
1330
|
+
yield json.dumps({
|
|
1331
|
+
'type': 'text_done',
|
|
1332
|
+
'response': full_response,
|
|
1333
|
+
'actions': captured_actions,
|
|
1334
|
+
'timing': {
|
|
1335
|
+
'handshake_ms': metrics.get('handshake_ms'),
|
|
1336
|
+
'llm_ms': metrics.get('llm_inference_ms'),
|
|
1337
|
+
}
|
|
1338
|
+
}) + '\n'
|
|
1339
|
+
|
|
1340
|
+
# Auto-reset removed — loop detection (Phase 1 config)
|
|
1341
|
+
# handles stuck agents; consecutive empties no longer
|
|
1342
|
+
# trigger a session key bump that would cold-cache Z.AI.
|
|
1343
|
+
|
|
1344
|
+
# Handle [SESSION_RESET] trigger from agent
|
|
1345
|
+
if full_response and '[SESSION_RESET]' in full_response:
|
|
1346
|
+
old_key = get_voice_session_key()
|
|
1347
|
+
new_key = bump_voice_session()
|
|
1348
|
+
logger.info(
|
|
1349
|
+
f'### AGENT-TRIGGERED SESSION RESET: {old_key} → {new_key}'
|
|
1350
|
+
)
|
|
1351
|
+
full_response = full_response.replace('[SESSION_RESET]', '').strip()
|
|
1352
|
+
|
|
1353
|
+
# Detect agent returning a bare file path (e.g. from TTS tool use)
|
|
1354
|
+
if full_response and re.match(r'^/tmp/[\w/.-]+$', full_response.strip()):
|
|
1355
|
+
file_path = full_response.strip()
|
|
1356
|
+
logger.warning(f'Agent returned file path — serving directly: {file_path}')
|
|
1357
|
+
try:
|
|
1358
|
+
with open(file_path, 'rb') as f:
|
|
1359
|
+
file_bytes = f.read()
|
|
1360
|
+
audio_b64 = base64.b64encode(file_bytes).decode('utf-8')
|
|
1361
|
+
ext = file_path.rsplit('.', 1)[-1].lower()
|
|
1362
|
+
audio_format = ext if ext in ('mp3', 'wav', 'ogg') else 'mp3'
|
|
1363
|
+
metrics['tts_generation_ms'] = 0
|
|
1364
|
+
metrics['total_ms'] = int((time.time() - t_request_start) * 1000)
|
|
1365
|
+
yield json.dumps({
|
|
1366
|
+
'type': 'audio',
|
|
1367
|
+
'audio': audio_b64,
|
|
1368
|
+
'audio_format': audio_format,
|
|
1369
|
+
'chunk': 0,
|
|
1370
|
+
'timing': {'tts_ms': 0, 'total_ms': metrics.get('total_ms')},
|
|
1371
|
+
}) + '\n'
|
|
1372
|
+
logger.info(f'Served agent-generated audio: {len(file_bytes)} bytes ({audio_format})')
|
|
1373
|
+
except Exception as fp_err:
|
|
1374
|
+
logger.error(f'Failed to serve agent audio file {file_path}: {fp_err}')
|
|
1375
|
+
yield json.dumps({
|
|
1376
|
+
'type': 'tts_error',
|
|
1377
|
+
'provider': 'agent',
|
|
1378
|
+
'reason': 'file_read_error',
|
|
1379
|
+
'error': f'Agent generated audio but file could not be read: {fp_err}',
|
|
1380
|
+
}) + '\n'
|
|
1381
|
+
log_metrics(metrics)
|
|
1382
|
+
break
|
|
1383
|
+
|
|
1384
|
+
# ── Flush TTS buffer + yield audio chunks in order ──
|
|
1385
|
+
metrics['response_len'] = len(full_response) if full_response else 0
|
|
1386
|
+
|
|
1387
|
+
# If response was suppressed (None), discard ALL
|
|
1388
|
+
# pending TTS — never speak suppressed text like
|
|
1389
|
+
# HEARTBEAT_OK that leaked through delta streaming.
|
|
1390
|
+
if not full_response:
|
|
1391
|
+
if _tts_pending:
|
|
1392
|
+
logger.info(
|
|
1393
|
+
f"### Discarding {len(_tts_pending)} TTS "
|
|
1394
|
+
f"chunks for suppressed response"
|
|
1395
|
+
)
|
|
1396
|
+
_tts_buf = ''
|
|
1397
|
+
_tts_pending = []
|
|
1398
|
+
|
|
1399
|
+
# Fire TTS for any remaining buffered text
|
|
1400
|
+
_remaining = _tts_buf.strip()
|
|
1401
|
+
if _remaining:
|
|
1402
|
+
_tts_pending.append(_fire_tts(_remaining))
|
|
1403
|
+
_tts_buf = ''
|
|
1404
|
+
|
|
1405
|
+
# Fallback: no sentences extracted (very short response)
|
|
1406
|
+
if not _tts_pending and full_response:
|
|
1407
|
+
tts_text = clean_for_tts(full_response)
|
|
1408
|
+
if tts_text and tts_text.strip():
|
|
1409
|
+
_tts_pending.append(_fire_tts(tts_text))
|
|
1410
|
+
|
|
1411
|
+
if not _tts_pending:
|
|
1412
|
+
logger.info('Skipping TTS — no speakable text')
|
|
1413
|
+
# Tell the frontend there's no audio coming so it can
|
|
1414
|
+
# reset isProcessing and re-enable the mic.
|
|
1415
|
+
yield json.dumps({'type': 'no_audio'}) + '\n'
|
|
1416
|
+
metrics['total_ms'] = int((time.time() - t_request_start) * 1000)
|
|
1417
|
+
log_metrics(metrics)
|
|
1418
|
+
if full_response:
|
|
1419
|
+
log_conversation('assistant', full_response,
|
|
1420
|
+
session_id=session_id,
|
|
1421
|
+
tts_provider=tts_provider, voice=voice)
|
|
1422
|
+
save_conversation_turn(
|
|
1423
|
+
user_msg=user_message,
|
|
1424
|
+
ai_response=full_response,
|
|
1425
|
+
session_id=session_id,
|
|
1426
|
+
session_key=_session_key,
|
|
1427
|
+
tts_provider=tts_provider,
|
|
1428
|
+
voice=voice,
|
|
1429
|
+
duration_ms=metrics.get('total_ms'),
|
|
1430
|
+
actions=captured_actions,
|
|
1431
|
+
identified_person=identified_person,
|
|
1432
|
+
)
|
|
1433
|
+
break
|
|
1434
|
+
|
|
1435
|
+
t_tts_start = time.time()
|
|
1436
|
+
total_chunks = _chunks_sent + len(_tts_pending)
|
|
1437
|
+
tts_ok = True
|
|
1438
|
+
for i, (done_evt, res) in enumerate(_tts_pending):
|
|
1439
|
+
done_evt.wait(timeout=30)
|
|
1440
|
+
if res['error']:
|
|
1441
|
+
metrics['tts_success'] = 0
|
|
1442
|
+
metrics['tts_error'] = res['error']
|
|
1443
|
+
yield _tts_error_event(res['error'])
|
|
1444
|
+
tts_ok = False
|
|
1445
|
+
break
|
|
1446
|
+
if res['audio']:
|
|
1447
|
+
yield json.dumps({
|
|
1448
|
+
'type': 'audio',
|
|
1449
|
+
'audio': res['audio'],
|
|
1450
|
+
'audio_format': _audio_fmt,
|
|
1451
|
+
'chunk': _chunks_sent + i,
|
|
1452
|
+
'total_chunks': total_chunks,
|
|
1453
|
+
'timing': {
|
|
1454
|
+
'tts_ms': int((time.time() - t_tts_start) * 1000),
|
|
1455
|
+
'total_ms': int((time.time() - t_request_start) * 1000),
|
|
1456
|
+
},
|
|
1457
|
+
}) + '\n'
|
|
1458
|
+
|
|
1459
|
+
metrics['tts_generation_ms'] = int((time.time() - t_tts_start) * 1000)
|
|
1460
|
+
metrics['tts_text_len'] = metrics['response_len']
|
|
1461
|
+
metrics['total_ms'] = int((time.time() - t_request_start) * 1000)
|
|
1462
|
+
log_metrics(metrics)
|
|
1463
|
+
if full_response:
|
|
1464
|
+
log_conversation('assistant', full_response,
|
|
1465
|
+
session_id=session_id,
|
|
1466
|
+
tts_provider=tts_provider, voice=voice)
|
|
1467
|
+
save_conversation_turn(
|
|
1468
|
+
user_msg=user_message,
|
|
1469
|
+
ai_response=full_response,
|
|
1470
|
+
session_id=session_id,
|
|
1471
|
+
session_key=_session_key,
|
|
1472
|
+
tts_provider=tts_provider,
|
|
1473
|
+
voice=voice,
|
|
1474
|
+
duration_ms=metrics.get('total_ms'),
|
|
1475
|
+
actions=captured_actions,
|
|
1476
|
+
identified_person=identified_person,
|
|
1477
|
+
)
|
|
1478
|
+
break
|
|
1479
|
+
|
|
1480
|
+
if evt['type'] == 'error':
|
|
1481
|
+
yield json.dumps({
|
|
1482
|
+
'type': 'error',
|
|
1483
|
+
'error': evt.get('error', 'Unknown error')
|
|
1484
|
+
}) + '\n'
|
|
1485
|
+
break
|
|
1486
|
+
|
|
1487
|
+
# Drain any unprocessed events (debug: detect generator exit without text_done)
|
|
1488
|
+
_remaining_evts = []
|
|
1489
|
+
while not event_queue.empty():
|
|
1490
|
+
try:
|
|
1491
|
+
_remaining_evts.append(event_queue.get_nowait())
|
|
1492
|
+
except Exception:
|
|
1493
|
+
break
|
|
1494
|
+
if _remaining_evts:
|
|
1495
|
+
_types = [e.get('type', '?') for e in _remaining_evts]
|
|
1496
|
+
logger.warning(f"### STREAM EXIT with {len(_remaining_evts)} unprocessed events: {_types}")
|
|
1497
|
+
|
|
1498
|
+
return Response(
|
|
1499
|
+
stream_response(),
|
|
1500
|
+
mimetype='application/x-ndjson',
|
|
1501
|
+
headers={'X-Accel-Buffering': 'no', 'Cache-Control': 'no-cache'}
|
|
1502
|
+
)
|
|
1503
|
+
|
|
1504
|
+
else:
|
|
1505
|
+
# ── NON-STREAMING: wait for full Gateway response ─────────
|
|
1506
|
+
gw_thread.join(timeout=310)
|
|
1507
|
+
while not event_queue.empty():
|
|
1508
|
+
evt = event_queue.get_nowait()
|
|
1509
|
+
if evt['type'] == 'text_done':
|
|
1510
|
+
ai_response = evt.get('response')
|
|
1511
|
+
elif evt['type'] == 'handshake':
|
|
1512
|
+
metrics['handshake_ms'] = evt['ms']
|
|
1513
|
+
metrics['llm_inference_ms'] = int((time.time() - t_llm_start) * 1000)
|
|
1514
|
+
metrics['tool_count'] = sum(
|
|
1515
|
+
1 for a in captured_actions
|
|
1516
|
+
if a.get('type') == 'tool' and a.get('phase') == 'start'
|
|
1517
|
+
)
|
|
1518
|
+
metrics['profile'] = 'gateway'
|
|
1519
|
+
metrics['model'] = 'glm-4.7-flash'
|
|
1520
|
+
logger.info(
|
|
1521
|
+
f"### LLM inference completed in {metrics['llm_inference_ms']}ms "
|
|
1522
|
+
f"(tools={metrics['tool_count']})"
|
|
1523
|
+
)
|
|
1524
|
+
|
|
1525
|
+
except Exception as e:
|
|
1526
|
+
logger.error(f'Failed to call Clawdbot Gateway: {e}')
|
|
1527
|
+
|
|
1528
|
+
# ── FALLBACK: Z.AI direct (glm-4.5-flash, no tools) ──────────────────
|
|
1529
|
+
if not ai_response:
|
|
1530
|
+
if metrics.get('profile') == 'gateway':
|
|
1531
|
+
logger.warning('No text response from Gateway, falling back to Z.AI flash...')
|
|
1532
|
+
metrics['fallback_used'] = 1
|
|
1533
|
+
else:
|
|
1534
|
+
logger.info('Using Z.AI flash direct (primary path)')
|
|
1535
|
+
t_flash_start = time.time()
|
|
1536
|
+
# Lazy import to avoid circular dependency (server.py imports this blueprint)
|
|
1537
|
+
try:
|
|
1538
|
+
import server as _server
|
|
1539
|
+
ai_response = _server.get_zai_direct_response(message_with_context, session_id)
|
|
1540
|
+
except Exception as e:
|
|
1541
|
+
logger.error(f'Z.AI direct call failed: {e}')
|
|
1542
|
+
ai_response = None
|
|
1543
|
+
metrics['profile'] = 'flash-direct'
|
|
1544
|
+
metrics['model'] = 'glm-4.5-flash'
|
|
1545
|
+
metrics['llm_inference_ms'] = int((time.time() - t_flash_start) * 1000)
|
|
1546
|
+
|
|
1547
|
+
# ── LAST RESORT ───────────────────────────────────────────────────────
|
|
1548
|
+
if not ai_response:
|
|
1549
|
+
logger.warning('Both Gateway and Z.AI flash failed, using generic fallback')
|
|
1550
|
+
ai_response = "One moment, I'm still working on something."
|
|
1551
|
+
|
|
1552
|
+
# Clean text for TTS
|
|
1553
|
+
tts_text = clean_for_tts(ai_response)
|
|
1554
|
+
logger.info(f'Cleaned TTS text ({len(tts_text)} chars): {tts_text[:100]}...')
|
|
1555
|
+
metrics['response_len'] = len(ai_response) if ai_response else 0
|
|
1556
|
+
metrics['tts_text_len'] = len(tts_text)
|
|
1557
|
+
|
|
1558
|
+
# Generate TTS audio
|
|
1559
|
+
t_tts_start = time.time()
|
|
1560
|
+
audio_base64 = None
|
|
1561
|
+
if tts_text and tts_text.strip():
|
|
1562
|
+
audio_base64 = _tts_generate_b64(tts_text, voice=voice or 'M1',
|
|
1563
|
+
tts_provider=tts_provider)
|
|
1564
|
+
if audio_base64 is None:
|
|
1565
|
+
metrics['tts_success'] = 0
|
|
1566
|
+
metrics['tts_error'] = 'TTS generation failed'
|
|
1567
|
+
t_tts_end = time.time()
|
|
1568
|
+
metrics['tts_generation_ms'] = int((t_tts_end - t_tts_start) * 1000)
|
|
1569
|
+
metrics['total_ms'] = int((t_tts_end - t_request_start) * 1000)
|
|
1570
|
+
|
|
1571
|
+
log_metrics(metrics)
|
|
1572
|
+
if ai_response:
|
|
1573
|
+
log_conversation('assistant', ai_response, session_id=session_id,
|
|
1574
|
+
tts_provider=tts_provider, voice=voice)
|
|
1575
|
+
save_conversation_turn(
|
|
1576
|
+
user_msg=user_message,
|
|
1577
|
+
ai_response=ai_response,
|
|
1578
|
+
session_id=session_id,
|
|
1579
|
+
session_key=get_voice_session_key(),
|
|
1580
|
+
tts_provider=tts_provider,
|
|
1581
|
+
voice=voice,
|
|
1582
|
+
duration_ms=metrics.get('total_ms'),
|
|
1583
|
+
actions=captured_actions,
|
|
1584
|
+
identified_person=identified_person,
|
|
1585
|
+
)
|
|
1586
|
+
|
|
1587
|
+
response_data = {'response': ai_response, 'user_said': user_message}
|
|
1588
|
+
if audio_base64:
|
|
1589
|
+
response_data['audio'] = audio_base64
|
|
1590
|
+
if captured_actions:
|
|
1591
|
+
response_data['actions'] = captured_actions
|
|
1592
|
+
response_data['timing'] = {
|
|
1593
|
+
'handshake_ms': metrics.get('handshake_ms'),
|
|
1594
|
+
'llm_ms': metrics.get('llm_inference_ms'),
|
|
1595
|
+
'tts_ms': metrics.get('tts_generation_ms'),
|
|
1596
|
+
'total_ms': metrics.get('total_ms'),
|
|
1597
|
+
}
|
|
1598
|
+
|
|
1599
|
+
return jsonify(response_data)
|
|
1600
|
+
|
|
1601
|
+
# ---------------------------------------------------------------------------
|
|
1602
|
+
# POST /api/conversation/abort
|
|
1603
|
+
# ---------------------------------------------------------------------------
|
|
1604
|
+
|
|
1605
|
+
|
|
1606
|
+
@conversation_bp.route('/api/conversation/abort', methods=['POST'])
def conversation_abort():
    """Abort the active agent run for the current voice session.

    Fire-and-forget from client — used by PTT interrupt and sendMessage
    interrupt to tell openclaw to stop generating so it doesn't waste compute.
    """
    session_key = get_voice_session_key()
    # Pull the abort source/text from the request body for debugging;
    # tolerate a missing or malformed JSON body.
    source, source_text = 'unknown', ''
    try:
        payload = request.get_json(silent=True) or {}
        source = payload.get('source', 'unknown')
        source_text = payload.get('text', '')
    except Exception:
        pass
    aborted = False
    gw = gateway_manager.get('openclaw')
    if gw and hasattr(gw, 'abort_active_run'):
        aborted = gw.abort_active_run(session_key)
    logger.info(f"### ABORT request session={session_key} aborted={aborted} source={source} text={source_text!r}")
    return jsonify({'ok': True, 'aborted': aborted})
|
|
1629
|
+
|
|
1630
|
+
|
|
1631
|
+
# ---------------------------------------------------------------------------
|
|
1632
|
+
# POST /api/conversation/steer
|
|
1633
|
+
# ---------------------------------------------------------------------------
|
|
1634
|
+
|
|
1635
|
+
|
|
1636
|
+
@conversation_bp.route('/api/conversation/steer', methods=['POST'])
def conversation_steer():
    """Inject a user message into the active agent run (steer mode).

    Fire-and-forget from client — used when the user speaks while the
    agent is silently working (tools / sub-agents / heartbeat). Rather
    than aborting the run and starting over, a second chat.send goes to
    the same session; OpenClaw's messages.queue.mode=steer surfaces the
    message at the next tool boundary so the agent pivots immediately.
    The already-open /api/conversation stream keeps receiving the
    steered output — no new streaming connection is needed.

    Request body:
        message (str) — the user's text to inject
        source (str) — label for logging (e.g. 'clawdbot-sendMessage')

    Returns:
        { ok: true, steered: true/false }
    """
    payload = request.get_json(silent=True) or {}
    text = (payload.get('message') or '').strip()
    origin = payload.get('source', 'unknown')

    # Guard clauses: reject empty and oversized input up front
    # (same length cap as the main conversation endpoint).
    if not text:
        return jsonify({'ok': False, 'error': 'No message provided'}), 400
    if len(text) > 4000:
        return jsonify({'ok': False, 'error': 'Message too long'}), 400

    session_key = get_voice_session_key()
    steered = gateway_manager.send_steer(text, session_key)

    logger.info(
        f"### STEER request session={session_key} steered={steered} "
        f"source={origin} text={text!r}"
    )

    # Record the steer message as a user turn so the transcript is preserved.
    log_conversation('user', text, session_id='default')

    return jsonify({'ok': True, 'steered': steered})
|
|
1681
|
+
|
|
1682
|
+
|
|
1683
|
+
# ---------------------------------------------------------------------------
|
|
1684
|
+
# POST /api/conversation/reset
|
|
1685
|
+
# ---------------------------------------------------------------------------
|
|
1686
|
+
|
|
1687
|
+
|
|
1688
|
+
@conversation_bp.route('/api/conversation/reset', methods=['POST'])
def conversation_reset():
    """Clear in-process conversation history for a session.

    Optional JSON body: {"session_id": str} — defaults to 'default'.
    Returns a 200 status payload whether or not any history existed.
    """
    # silent=True tolerates a missing or non-JSON body instead of raising
    # a 415 — consistent with the abort/steer endpoints in this blueprint.
    body = request.get_json(silent=True) or {}
    session_id = body.get('session_id', 'default')
    # pop with default: no error when this session has no stored history
    conversation_histories.pop(session_id, None)
    return jsonify({'status': 'ok', 'message': 'Conversation history cleared'})
|
|
1695
|
+
|
|
1696
|
+
|
|
1697
|
+
# ---------------------------------------------------------------------------
|
|
1698
|
+
# POST /api/session/reset — manual session reset from UI actions panel
|
|
1699
|
+
# ---------------------------------------------------------------------------
|
|
1700
|
+
|
|
1701
|
+
@conversation_bp.route('/api/session/reset', methods=['POST'])
def session_reset():
    """Clear the corrupted openclaw session state and return a fresh session key.

    Called by the Reset button in the UI actions panel.
    Clears the openclaw session JSONL file so orphaned messages don't cascade,
    then bumps the voice session key so the next request starts completely fresh.

    Returns: {'status': 'ok', 'old': <old key>, 'new': <new key>}
    """
    old_key = get_voice_session_key()
    # Find and clear the openclaw session file for the current session key.
    # Best-effort: failure to clear the file must never block the key bump.
    try:
        sessions_dir = Path('/home/node/.openclaw/agents/openvoiceui/sessions')
        sessions_json = sessions_dir / 'sessions.json'
        if sessions_json.exists():
            import json as _json
            from datetime import datetime as _dt
            sessions_map = _json.loads(sessions_json.read_text())
            # The openclaw session key format is "agent:openvoiceui:<voice_key>"
            oclaw_key = f'agent:openvoiceui:{old_key}'
            session_id = sessions_map.get(oclaw_key, {}).get('sessionId')
            if session_id:
                session_file = sessions_dir / f'{session_id}.jsonl'
                if session_file.exists():
                    # utcnow() is deprecated but kept so the timestamp keeps
                    # its trailing-'Z' format (now(UTC) would emit '+00:00').
                    _ts = _dt.utcnow().isoformat() + 'Z'
                    # Serialize the header with json.dumps instead of string
                    # concatenation so unusual characters in session_id can't
                    # produce an invalid JSONL line. Compact separators match
                    # the previous hand-built output byte for byte.
                    header = _json.dumps(
                        {
                            'type': 'session',
                            'version': 3,
                            'id': session_id,
                            'timestamp': _ts,
                            'cwd': '/home/node/.openclaw/workspace',
                        },
                        separators=(',', ':'),
                    )
                    session_file.write_text(header + '\n')
                    logger.info(f'### SESSION RESET: cleared openclaw session file {session_id}.jsonl')
    except Exception as e:
        logger.warning(f'### SESSION RESET: could not clear openclaw session file: {e}')
    new_key = bump_voice_session()
    return jsonify({'status': 'ok', 'old': old_key, 'new': new_key})
|
|
1729
|
+
|
|
1730
|
+
|
|
1731
|
+
# ---------------------------------------------------------------------------
|
|
1732
|
+
# GET /api/tts/providers
|
|
1733
|
+
# ---------------------------------------------------------------------------
|
|
1734
|
+
|
|
1735
|
+
|
|
1736
|
+
@conversation_bp.route('/api/tts/providers', methods=['GET'])
def tts_providers_list():
    """List all available TTS providers with metadata."""
    try:
        # Include inactive providers so clients can display the full catalog.
        providers = list_providers(include_inactive=True)

        # Resolve the configured default provider; fall back to 'supertonic'
        # when the config file is absent or unreadable.
        default_provider = 'supertonic'
        config_path = Path(__file__).parent.parent / 'tts_providers' / 'providers_config.json'
        try:
            with open(config_path, 'r') as fh:
                default_provider = json.load(fh).get('default_provider', 'supertonic')
        except Exception:
            pass

        return jsonify({'providers': providers, 'default_provider': default_provider})
    except Exception as e:
        logger.error(f'Failed to list TTS providers: {e}')
        return jsonify({'error': f'Failed to list providers: {e}'}), 500
|
|
1754
|
+
|
|
1755
|
+
# ---------------------------------------------------------------------------
|
|
1756
|
+
# POST /api/tts/generate
|
|
1757
|
+
# ---------------------------------------------------------------------------
|
|
1758
|
+
|
|
1759
|
+
|
|
1760
|
+
@conversation_bp.route('/api/tts/generate', methods=['POST'])
def tts_generate():
    """
    Generate speech from text using the specified TTS provider.

    Request JSON:
        text     : str   — text to synthesize (required, max 2000 chars)
        provider : str   — provider ID (default: supertonic)
        voice    : str   — voice ID (default: provider default)
        lang     : str   — language code (default: en; case-insensitive)
        speed    : float — speech speed (default: provider default; 0.25–4.0)
        options  : dict  — provider-specific options
    Returns: WAV (or MP3) audio file
    """
    try:
        data = request.get_json()
        if not data:
            return jsonify({'error': 'No JSON data provided'}), 400

        text = data.get('text', '').strip()
        if not text:
            return jsonify({'error': 'Text cannot be empty'}), 400

        # Length guard (P7-T3 security audit)
        if len(text) > 2000:
            return jsonify({'error': 'Text too long (max 2000 characters)'}), 400

        provider_id = data.get('provider', 'supertonic')
        voice = data.get('voice', None)
        lang = data.get('lang', 'en')
        speed = data.get('speed', None)
        options = data.get('options', {})

        valid_langs = ['en', 'ko', 'es', 'pt', 'fr', 'zh', 'ja', 'de']
        if lang:
            # Normalize case once so validation and the provider see the same
            # value (previously 'EN' passed validation but the unnormalized
            # string was forwarded to the provider).
            lang = lang.lower()
            if lang not in valid_langs:
                return jsonify({
                    'error': f"Invalid language: {lang}. Supported: {', '.join(valid_langs)}"
                }), 400

        if speed is not None:
            try:
                speed = float(speed)
            except (ValueError, TypeError):
                return jsonify({'error': 'Speed must be a valid number'}), 400
            if speed < 0.25 or speed > 4.0:
                return jsonify({'error': 'Speed must be between 0.25 and 4.0'}), 400

        try:
            provider = get_provider(provider_id)
        except ValueError:
            available = ', '.join([p['provider_id'] for p in list_providers()])
            return jsonify({'error': 'Invalid TTS provider', 'available_providers': available}), 400

        logger.info(
            f"TTS request: provider={provider_id}, text='{text[:50]}...', "
            f"voice={voice}, lang={lang}, speed={speed}"
        )

        # Only forward parameters the caller actually supplied so the
        # provider's own defaults apply otherwise.
        gen_params = {'text': text}
        if voice is not None:
            gen_params['voice'] = voice
        if lang is not None:
            gen_params['lang'] = lang
        if speed is not None:
            gen_params['speed'] = speed
        gen_params.update(options)

        try:
            audio_bytes = provider.generate_speech(**gen_params)
        except ValueError as e:
            return jsonify({'error': f'Invalid parameter: {e}'}), 400
        except Exception as e:
            logger.error(f'Speech generation failed for {provider_id}: {e}')
            return jsonify({'error': f'Speech generation failed: {e}'}), 500

        # Pick the MIME type from the provider's declared output format.
        provider_format = provider.get_info().get('audio_format', 'wav')
        mime_type = 'audio/mpeg' if provider_format == 'mp3' else 'audio/wav'
        response = make_response(audio_bytes)
        response.headers['Content-Type'] = mime_type
        response.headers['Content-Length'] = len(audio_bytes)
        response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
        response.headers['X-TTS-Provider'] = provider_id
        if voice:
            response.headers['X-TTS-Voice'] = voice
        return response

    except ValueError as e:
        return jsonify({'error': f'Invalid input: {e}'}), 400
    except Exception as e:
        import traceback
        logger.error(f'TTS generate endpoint error: {e}')
        logger.error(traceback.format_exc())
        return jsonify({'error': 'Internal server error'}), 500
|
|
1853
|
+
|
|
1854
|
+
# ---------------------------------------------------------------------------
|
|
1855
|
+
# POST /api/tts/clone — Clone a voice from audio
|
|
1856
|
+
# ---------------------------------------------------------------------------
|
|
1857
|
+
|
|
1858
|
+
|
|
1859
|
+
@conversation_bp.route('/api/tts/clone', methods=['POST'])
def tts_clone_voice():
    """
    Clone a voice from an audio sample.

    Accepts either:
        - JSON: {"audio_url": "...", "name": "...", "reference_text": "..."}
        - Multipart form: audio file + name field

    Returns: JSON with voice_id, name, embedding metadata.
    """
    try:
        provider = get_provider('qwen3')
        if not provider.is_available():
            return jsonify({'error': 'Qwen3 provider not available (FAL_KEY not set)'}), 503

        # JSON mode (audio already hosted at a URL)
        if request.is_json:
            data = request.get_json()
            # `or ''` guards against explicit JSON nulls, which would
            # otherwise raise AttributeError on .strip() and surface as a
            # 500 instead of the intended 400.
            audio_url = (data.get('audio_url') or '').strip()
            name = (data.get('name') or '').strip()
            reference_text = (data.get('reference_text') or '').strip() or None

            if not audio_url:
                return jsonify({'error': 'audio_url is required'}), 400
            if not name:
                return jsonify({'error': 'name is required'}), 400

        # Multipart form mode (upload audio file directly)
        elif 'audio' in request.files:
            from services.paths import UPLOADS_DIR
            import uuid

            audio_file = request.files['audio']
            name = request.form.get('name', '').strip()
            reference_text = request.form.get('reference_text', '').strip() or None

            if not name:
                return jsonify({'error': 'name field is required'}), 400
            if not audio_file.filename:
                return jsonify({'error': 'Empty audio file'}), 400

            # Save upload under a server-generated name (never the
            # client-supplied filename) after whitelisting the extension.
            ext = Path(audio_file.filename).suffix.lower()
            if ext not in ('.wav', '.mp3', '.m4a', '.ogg', '.webm', '.flac'):
                return jsonify({'error': f'Unsupported audio format: {ext}'}), 400

            safe_name = f"voice_clone_{uuid.uuid4().hex[:12]}{ext}"
            UPLOADS_DIR.mkdir(parents=True, exist_ok=True)
            save_path = UPLOADS_DIR / safe_name
            audio_file.save(str(save_path))

            # Build public URL for fal.ai to fetch
            audio_url = f"{request.host_url.rstrip('/')}/uploads/{safe_name}"
        else:
            return jsonify({
                'error': 'Send JSON with audio_url or multipart form with audio file'
            }), 400

        logger.info(f"Voice clone request: name='{name}', url={audio_url[:80]}")
        result = provider.clone_voice(
            audio_url=audio_url,
            name=name,
            reference_text=reference_text,
        )

        return jsonify({
            'status': 'ok',
            'voice_id': result['voice_id'],
            'name': result['name'],
            'created_at': result['created_at'],
            'clone_time_ms': result['clone_time_ms'],
            'embedding_size': result['embedding_size'],
            'usage': (
                f'Use voice_id "{result["voice_id"]}" in /api/tts/generate '
                f'with provider=qwen3'
            ),
        })

    except RuntimeError as e:
        logger.error(f"Voice clone failed: {e}")
        return jsonify({'error': str(e)}), 500
    except Exception as e:
        import traceback
        logger.error(f"Voice clone error: {e}")
        logger.error(traceback.format_exc())
        return jsonify({'error': 'Internal server error'}), 500
|
|
1946
|
+
|
|
1947
|
+
|
|
1948
|
+
# ---------------------------------------------------------------------------
|
|
1949
|
+
# GET /api/tts/voices — List all voices (built-in + cloned) across providers
|
|
1950
|
+
# ---------------------------------------------------------------------------
|
|
1951
|
+
|
|
1952
|
+
|
|
1953
|
+
@conversation_bp.route('/api/tts/voices', methods=['GET'])
def tts_voices_list():
    """List all available voices across all providers, including cloned voices."""
    try:
        catalog = {}
        for info in list_providers(include_inactive=False):
            # Prefer the explicit provider_id; fall back to the display name.
            key = info.get('provider_id', info.get('name', 'unknown'))
            catalog[key] = {
                'builtin': info.get('voices', []),
                'cloned': info.get('cloned_voices', []),
            }
        return jsonify({'voices': catalog})
    except Exception as e:
        logger.error(f"Failed to list voices: {e}")
        return jsonify({'error': str(e)}), 500
|
|
1970
|
+
|
|
1971
|
+
|
|
1972
|
+
# ---------------------------------------------------------------------------
|
|
1973
|
+
# DELETE /api/tts/voices/<voice_id> — Retire a cloned voice
|
|
1974
|
+
# ---------------------------------------------------------------------------
|
|
1975
|
+
|
|
1976
|
+
|
|
1977
|
+
@conversation_bp.route('/api/tts/voices/<voice_id>', methods=['DELETE'])
def tts_delete_voice(voice_id):
    """Retire a cloned voice embedding (renamed, not deleted)."""
    try:
        # Only clone_* directories are eligible; built-in voices are immutable.
        if not voice_id.startswith('clone_'):
            return jsonify({'error': 'Can only retire cloned voices (clone_*)'}), 400

        from services.paths import VOICE_CLONES_DIR

        target_dir = VOICE_CLONES_DIR / voice_id
        # Validate path doesn't escape the clones directory (e.g. via '../').
        try:
            target_dir.resolve().relative_to(VOICE_CLONES_DIR.resolve())
        except ValueError:
            return jsonify({'error': 'Invalid voice_id'}), 400

        if not target_dir.exists():
            return jsonify({'error': f'Voice {voice_id} not found'}), 404

        # Rename to .retired instead of removing (NEVER DELETE rule).
        retired_dir = target_dir.with_name(target_dir.name + '.retired')
        target_dir.rename(retired_dir)
        logger.info(f"Cloned voice retired: {voice_id}")

        return jsonify({'status': 'ok', 'voice_id': voice_id, 'action': 'retired'})
    except Exception as e:
        logger.error(f"Failed to retire voice {voice_id}: {e}")
        return jsonify({'error': str(e)}), 500
|
|
2005
|
+
|
|
2006
|
+
|
|
2007
|
+
# ---------------------------------------------------------------------------
|
|
2008
|
+
# POST /api/supertonic-tts (DEPRECATED — use /api/tts/generate)
|
|
2009
|
+
# ---------------------------------------------------------------------------
|
|
2010
|
+
|
|
2011
|
+
|
|
2012
|
+
@conversation_bp.route('/api/supertonic-tts', methods=['POST'])
def supertonic_tts_endpoint():
    """
    Generate speech via Supertonic TTS (deprecated — prefer /api/tts/generate).

    Request JSON: text, lang, speed, voice_style
    Returns: WAV audio
    """
    try:
        payload = request.get_json()
        if not payload:
            return jsonify({'error': 'No JSON data provided'}), 400

        text = payload.get('text', '').strip()
        if not text:
            return jsonify({'error': 'Text cannot be empty'}), 400

        lang = payload.get('lang', 'en').lower()
        if lang not in ('en', 'ko', 'es', 'pt', 'fr'):
            return jsonify({
                'error': f"Invalid language: {lang}. Supported: en, ko, es, pt, fr"
            }), 400

        speed = float(payload.get('speed', 1.0))
        if not 0.5 <= speed <= 2.0:
            return jsonify({'error': 'Speed must be between 0.5 and 2.0'}), 400

        voice_style = payload.get('voice_style', 'M1').upper()
        valid_voices = ['M1', 'M2', 'M3', 'M4', 'M5', 'F1', 'F2', 'F3', 'F4', 'F5']
        if voice_style not in valid_voices:
            return jsonify({
                'error': f"Invalid voice: {voice_style}. "
                         f"Available: {', '.join(valid_voices)}"
            }), 400

        logger.info(f"Generating speech: {text[:50]}... (lang={lang}, speed={speed})")

        # Loading the voice style and synthesizing are reported separately so
        # the client can tell which stage failed.
        try:
            engine = get_supertonic_for_voice(voice_style)
        except Exception as e:
            logger.error(f'Failed to initialize TTS with voice {voice_style}: {e}')
            return jsonify({'error': f'Failed to load voice style: {e}'}), 500

        try:
            audio_bytes = engine.generate_speech(
                text=text, lang=lang, speed=speed, total_step=16
            )
        except Exception as e:
            logger.error(f'Speech synthesis failed: {e}')
            return jsonify({'error': f'Speech synthesis failed: {e}'}), 500

        response = make_response(audio_bytes)
        response.headers['Content-Type'] = 'audio/wav'
        response.headers['Content-Length'] = len(audio_bytes)
        response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
        return response

    except ValueError as e:
        return jsonify({'error': f'Invalid input: {e}'}), 400
    except Exception as e:
        import traceback
        logger.error(f'TTS endpoint error: {e}')
        logger.error(traceback.format_exc())
        return jsonify({'error': 'Internal server error'}), 500
|
|
2076
|
+
|
|
2077
|
+
# ---------------------------------------------------------------------------
|
|
2078
|
+
# POST /api/tts/preview (P4-T5: TTS voice preview)
|
|
2079
|
+
# ---------------------------------------------------------------------------
|
|
2080
|
+
|
|
2081
|
+
# Default sample phrase spoken by /api/tts/preview when no custom text is given.
_PREVIEW_TEXT = "Hello! This is a preview of the selected voice."
|
|
2082
|
+
|
|
2083
|
+
|
|
2084
|
+
@conversation_bp.route('/api/tts/preview', methods=['POST'])
def tts_preview():
    """
    Generate a short audio preview for a given TTS voice.

    Request JSON (all optional):
        provider : str — TTS provider ID (default: 'supertonic')
        voice    : str — Voice ID (default: provider default, e.g. 'M1')
        text     : str — Custom preview text (max 200 chars; default sample phrase)

    Returns JSON:
        audio_b64 : str — Base64-encoded WAV audio
        provider  : str — Provider used
        voice     : str — Voice used
    """
    try:
        payload = request.get_json(silent=True) or {}

        provider_id = str(payload.get('provider', 'supertonic')).strip()
        voice = payload.get('voice', None)
        # Clamp custom text to 200 chars; fall back to the stock phrase when
        # the trimmed result is empty.
        text = str(payload.get('text', _PREVIEW_TEXT)).strip()[:200] or _PREVIEW_TEXT

        # Validate provider exists
        try:
            get_provider(provider_id)
        except ValueError:
            available = ', '.join(p['provider_id'] for p in list_providers())
            return jsonify({
                'error': f"Unknown provider: {provider_id}",
                'available_providers': available,
            }), 400

        logger.info(f"TTS preview: provider={provider_id}, voice={voice}, text='{text[:40]}'")

        audio_b64 = _tts_generate_b64(text=text, voice=voice, tts_provider=provider_id)
        if audio_b64 is None:
            return jsonify({'error': 'TTS generation failed — check server logs'}), 500

        return jsonify({
            'audio_b64': audio_b64,
            'provider': provider_id,
            'voice': voice or 'default',
        })

    except Exception as e:
        import traceback
        logger.error(f'TTS preview error: {e}')
        logger.error(traceback.format_exc())
        return jsonify({'error': 'Internal server error'}), 500
|
|
2138
|
+
|
|
2139
|
+
|
|
2140
|
+
@conversation_bp.route('/api/stt-events', methods=['POST'])
def stt_events():
    """Receive STT error/status events from the browser.

    Logs them in a format the session monitor can parse from container stdout.
    Only real errors are sent (no-speech and aborted are filtered client-side).
    """
    try:
        event = request.get_json(silent=True) or {}
        error_code = event.get('error', 'unknown')
        message = event.get('message', '')
        provider = event.get('provider', 'webspeech')
        source = event.get('source', 'stt')  # 'stt' or 'wake_word'

        # Emit in the session-monitor-parseable format; flush so the line
        # reaches container stdout immediately.
        print(
            f"### STT_ERROR: {error_code} — {message} (provider={provider} source={source})",
            flush=True,
        )
        return jsonify({'ok': True})
    except Exception:
        return jsonify({'ok': False}), 500
|