openvoiceui 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185)
  1. package/.env.example +104 -0
  2. package/Dockerfile +30 -0
  3. package/LICENSE +21 -0
  4. package/README.md +638 -0
  5. package/SETUP.md +360 -0
  6. package/app.py +232 -0
  7. package/auto-approve-devices.js +111 -0
  8. package/cli/index.js +372 -0
  9. package/config/__init__.py +4 -0
  10. package/config/default.yaml +43 -0
  11. package/config/flags.yaml +67 -0
  12. package/config/loader.py +203 -0
  13. package/config/providers.yaml +71 -0
  14. package/config/speech_normalization.yaml +182 -0
  15. package/config/theme.json +4 -0
  16. package/data/greetings.json +25 -0
  17. package/default-pages/ai-image-creator.html +915 -0
  18. package/default-pages/bulk-image-uploader.html +492 -0
  19. package/default-pages/desktop.html +2865 -0
  20. package/default-pages/file-explorer.html +854 -0
  21. package/default-pages/interactive-map.html +655 -0
  22. package/default-pages/style-guide.html +1005 -0
  23. package/default-pages/website-setup.html +1623 -0
  24. package/deploy/openclaw/Dockerfile +46 -0
  25. package/deploy/openvoiceui.service +30 -0
  26. package/deploy/setup-nginx.sh +50 -0
  27. package/deploy/setup-sudo.sh +306 -0
  28. package/deploy/skill-runner/Dockerfile +19 -0
  29. package/deploy/skill-runner/requirements.txt +14 -0
  30. package/deploy/skill-runner/server.py +269 -0
  31. package/deploy/supertonic/Dockerfile +22 -0
  32. package/deploy/supertonic/server.py +79 -0
  33. package/docker-compose.pinokio.yml +11 -0
  34. package/docker-compose.yml +59 -0
  35. package/greetings.json +25 -0
  36. package/index.html +65 -0
  37. package/inject-device-identity.js +142 -0
  38. package/package.json +82 -0
  39. package/profiles/default.json +114 -0
  40. package/profiles/manager.py +354 -0
  41. package/profiles/schema.json +337 -0
  42. package/prompts/voice-system-prompt.md +149 -0
  43. package/providers/__init__.py +39 -0
  44. package/providers/base.py +63 -0
  45. package/providers/llm/__init__.py +12 -0
  46. package/providers/llm/base.py +71 -0
  47. package/providers/llm/clawdbot_provider.py +112 -0
  48. package/providers/llm/zai_provider.py +115 -0
  49. package/providers/registry.py +320 -0
  50. package/providers/stt/__init__.py +12 -0
  51. package/providers/stt/base.py +58 -0
  52. package/providers/stt/webspeech_provider.py +49 -0
  53. package/providers/stt/whisper_provider.py +100 -0
  54. package/providers/tts/__init__.py +20 -0
  55. package/providers/tts/base.py +91 -0
  56. package/providers/tts/groq_provider.py +74 -0
  57. package/providers/tts/supertonic_provider.py +72 -0
  58. package/requirements.txt +38 -0
  59. package/routes/__init__.py +10 -0
  60. package/routes/admin.py +515 -0
  61. package/routes/canvas.py +1315 -0
  62. package/routes/chat.py +51 -0
  63. package/routes/conversation.py +2158 -0
  64. package/routes/elevenlabs_hybrid.py +306 -0
  65. package/routes/greetings.py +98 -0
  66. package/routes/icons.py +279 -0
  67. package/routes/image_gen.py +364 -0
  68. package/routes/instructions.py +190 -0
  69. package/routes/music.py +838 -0
  70. package/routes/onboarding.py +43 -0
  71. package/routes/pi.py +62 -0
  72. package/routes/profiles.py +215 -0
  73. package/routes/report_issue.py +68 -0
  74. package/routes/static_files.py +533 -0
  75. package/routes/suno.py +664 -0
  76. package/routes/theme.py +81 -0
  77. package/routes/transcripts.py +199 -0
  78. package/routes/vision.py +348 -0
  79. package/routes/workspace.py +288 -0
  80. package/server.py +1510 -0
  81. package/services/__init__.py +1 -0
  82. package/services/auth.py +143 -0
  83. package/services/canvas_versioning.py +239 -0
  84. package/services/db_pool.py +107 -0
  85. package/services/gateway.py +16 -0
  86. package/services/gateway_manager.py +333 -0
  87. package/services/gateways/__init__.py +12 -0
  88. package/services/gateways/base.py +110 -0
  89. package/services/gateways/compat.py +264 -0
  90. package/services/gateways/openclaw.py +1134 -0
  91. package/services/health.py +100 -0
  92. package/services/memory_client.py +455 -0
  93. package/services/paths.py +26 -0
  94. package/services/speech_normalizer.py +285 -0
  95. package/services/tts.py +270 -0
  96. package/setup-config.js +262 -0
  97. package/sounds/air_horn.mp3 +0 -0
  98. package/sounds/bruh.mp3 +0 -0
  99. package/sounds/crowd_cheer.mp3 +0 -0
  100. package/sounds/gunshot.mp3 +0 -0
  101. package/sounds/impact.mp3 +0 -0
  102. package/sounds/lets_go.mp3 +0 -0
  103. package/sounds/record_stop.mp3 +0 -0
  104. package/sounds/rewind.mp3 +0 -0
  105. package/sounds/sad_trombone.mp3 +0 -0
  106. package/sounds/scratch_long.mp3 +0 -0
  107. package/sounds/yeah.mp3 +0 -0
  108. package/src/adapters/ClawdBotAdapter.js +264 -0
  109. package/src/adapters/_template.js +133 -0
  110. package/src/adapters/elevenlabs-classic.js +841 -0
  111. package/src/adapters/elevenlabs-hybrid.js +812 -0
  112. package/src/adapters/hume-evi.js +676 -0
  113. package/src/admin.html +1339 -0
  114. package/src/app.js +8802 -0
  115. package/src/core/Config.js +173 -0
  116. package/src/core/EmotionEngine.js +307 -0
  117. package/src/core/EventBridge.js +180 -0
  118. package/src/core/EventBus.js +117 -0
  119. package/src/core/VoiceSession.js +607 -0
  120. package/src/face/BaseFace.js +259 -0
  121. package/src/face/EyeFace.js +208 -0
  122. package/src/face/HaloSmokeFace.js +509 -0
  123. package/src/face/manifest.json +27 -0
  124. package/src/face/previews/eyes.svg +16 -0
  125. package/src/face/previews/orb.svg +29 -0
  126. package/src/features/MusicPlayer.js +620 -0
  127. package/src/features/Soundboard.js +128 -0
  128. package/src/providers/DeepgramSTT.js +472 -0
  129. package/src/providers/DeepgramStreamingSTT.js +766 -0
  130. package/src/providers/GroqSTT.js +559 -0
  131. package/src/providers/TTSPlayer.js +323 -0
  132. package/src/providers/WebSpeechSTT.js +479 -0
  133. package/src/providers/tts/BaseTTSProvider.js +81 -0
  134. package/src/providers/tts/HumeProvider.js +77 -0
  135. package/src/providers/tts/SupertonicProvider.js +174 -0
  136. package/src/providers/tts/index.js +140 -0
  137. package/src/shell/adapter-registry.js +154 -0
  138. package/src/shell/caller-bridge.js +35 -0
  139. package/src/shell/camera-bridge.js +28 -0
  140. package/src/shell/canvas-bridge.js +32 -0
  141. package/src/shell/commercial-bridge.js +44 -0
  142. package/src/shell/face-bridge.js +44 -0
  143. package/src/shell/music-bridge.js +60 -0
  144. package/src/shell/orchestrator.js +233 -0
  145. package/src/shell/profile-discovery.js +303 -0
  146. package/src/shell/sounds-bridge.js +28 -0
  147. package/src/shell/transcript-bridge.js +61 -0
  148. package/src/shell/waveform-bridge.js +33 -0
  149. package/src/styles/base.css +2862 -0
  150. package/src/styles/face.css +417 -0
  151. package/src/styles/pi-overrides.css +89 -0
  152. package/src/styles/theme-dark.css +67 -0
  153. package/src/test-tts.html +175 -0
  154. package/src/ui/AppShell.js +544 -0
  155. package/src/ui/ProfileSwitcher.js +228 -0
  156. package/src/ui/SessionControl.js +240 -0
  157. package/src/ui/face/FacePicker.js +195 -0
  158. package/src/ui/face/FaceRenderer.js +309 -0
  159. package/src/ui/settings/PlaylistEditor.js +366 -0
  160. package/src/ui/settings/SettingsPanel.css +684 -0
  161. package/src/ui/settings/SettingsPanel.js +419 -0
  162. package/src/ui/settings/TTSVoicePreview.js +210 -0
  163. package/src/ui/themes/ThemeManager.js +213 -0
  164. package/src/ui/visualizers/BaseVisualizer.js +29 -0
  165. package/src/ui/visualizers/PartyFXVisualizer.css +291 -0
  166. package/src/ui/visualizers/PartyFXVisualizer.js +637 -0
  167. package/static/emulators/jsdos/js-dos.css +1 -0
  168. package/static/emulators/jsdos/js-dos.js +22 -0
  169. package/static/favicon.svg +55 -0
  170. package/static/icons/apple-touch-icon.png +0 -0
  171. package/static/icons/favicon-32.png +0 -0
  172. package/static/icons/icon-192.png +0 -0
  173. package/static/icons/icon-512.png +0 -0
  174. package/static/install.html +449 -0
  175. package/static/manifest.json +26 -0
  176. package/static/sw.js +21 -0
  177. package/tts_providers/__init__.py +136 -0
  178. package/tts_providers/base_provider.py +319 -0
  179. package/tts_providers/groq_provider.py +155 -0
  180. package/tts_providers/hume_provider.py +226 -0
  181. package/tts_providers/providers_config.json +119 -0
  182. package/tts_providers/qwen3_provider.py +371 -0
  183. package/tts_providers/resemble_provider.py +315 -0
  184. package/tts_providers/supertonic_provider.py +557 -0
  185. package/tts_providers/supertonic_tts.py +399 -0
@@ -0,0 +1,2158 @@
1
+ """
2
+ routes/conversation.py — Conversation & TTS Blueprint (P2-T3)
3
+
4
+ Extracted from server.py during Phase 2 blueprint split.
5
+ Registers routes:
6
+ POST /api/conversation (main voice conversation endpoint)
7
+ POST /api/conversation/reset (clear conversation history for a session)
8
+ GET /api/tts/providers (list available TTS providers)
9
+ POST /api/tts/generate (generate TTS audio from text)
10
+ POST /api/supertonic-tts (deprecated legacy TTS endpoint)
11
+
12
+ Also exports helpers used by other server.py code:
13
+ get_voice_session_key()
14
+ bump_voice_session()
15
+ conversation_histories (dict of session histories)
16
+ _consecutive_empty_responses (module global, accessed via this module)
17
+ clean_for_tts()
18
+ """
19
+
20
+ import base64
21
+ import json
22
+ import logging
23
+ import os
24
+ import queue
25
+ import re
26
+ import sqlite3
27
+ import threading
28
+ import time
29
+ from datetime import datetime
30
+ from pathlib import Path
31
+
32
+ from flask import Blueprint, Response, jsonify, make_response, request
33
+
34
+ from routes.canvas import canvas_context, update_canvas_context, CANVAS_PAGES_DIR
35
+ from routes.transcripts import save_conversation_turn
36
+ from routes.music import current_music_state as _music_state
37
+ from services.gateway_manager import gateway_manager
38
+ from services.gateways.compat import is_system_response
39
+ from services.tts import generate_tts_b64 as _tts_generate_b64
40
+ from tts_providers import get_provider, list_providers
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
+ # ---------------------------------------------------------------------------
45
+ # Constants
46
+ # ---------------------------------------------------------------------------
47
+
48
# Runtime data locations shared with server.py (see services/paths.py).
from services.paths import DB_PATH, VOICE_SESSION_FILE

# JSONL event feed consumed by the "Brain" context tracker (see _notify_brain()).
BRAIN_EVENTS_PATH = Path('/tmp/openvoiceui-events.jsonl')
# Cap on messages retained per in-process conversation history.
MAX_HISTORY_MESSAGES = 20

# Vision keyword detection — triggers camera frame analysis via GLM-4V
_VISION_KEYWORDS = (
    'what do you see', 'what can you see', 'what are you seeing',
    'look at', 'what is in front', "what's in front",
    'describe what', 'tell me what you see', 'can you see',
    'what is that', "what's that", 'who is that', "who's that",
    'what am i holding', 'what am i wearing', 'what does it look like',
    'what am i showing', 'what is this', "what's this",
    'show me what you see', 'use the camera', 'check the camera',
    'look through the camera', 'do you see', 'you see this',
    'take a look', 'what color', 'read this', 'read that',
)
_VISION_FRAME_MAX_AGE = 10  # seconds — ignore frames older than this

# ---------------------------------------------------------------------------
# Voice assistant instructions — injected into every message context.
#
# PRIMARY SOURCE: prompts/voice-system-prompt.md (hot-reload, no restart needed)
# Editable via admin API: PUT /api/instructions/voice-system-prompt
#
# FALLBACK: _VOICE_INSTRUCTIONS constant below (used if file missing/unreadable)
# ---------------------------------------------------------------------------

# prompts/ sits at the repo root, one level above this routes/ package.
_PROMPTS_DIR = Path(__file__).parent.parent / 'prompts'
_VOICE_PROMPT_FILE = _PROMPTS_DIR / 'voice-system-prompt.md'
78
+
79
+
80
def _load_voice_system_prompt() -> str:
    """Read voice-system-prompt.md and collapse it into one line of text.

    Lines beginning with '#' are treated as comments and dropped; remaining
    lines are stripped and joined with single spaces. The file is re-read on
    every call, so edits take effect without a restart (hot reload). Falls
    back to the hardcoded _VOICE_INSTRUCTIONS constant when the file is
    missing, unreadable, or yields no content.
    """
    try:
        text = _VOICE_PROMPT_FILE.read_text(encoding='utf-8')
        kept = (ln.strip() for ln in text.splitlines() if not ln.startswith('#'))
        prompt = ' '.join(piece for piece in kept if piece)
        if prompt:
            return prompt
    except Exception:
        pass
    return _VOICE_INSTRUCTIONS  # fallback to hardcoded constant
92
# Hardcoded fallback system prompt. Only used when voice-system-prompt.md is
# missing/unreadable (see _load_voice_system_prompt above). Every segment ends
# with a trailing space so adjacent-string concatenation stays readable.
_VOICE_INSTRUCTIONS = (
    "[OPENVOICEUI SYSTEM INSTRUCTIONS: "

    # --- Voice & Tone ---
    "You are a voice AI assistant. ALWAYS respond in English — never Chinese or any other language. "
    "Respond in natural, conversational tone — NO markdown (no #, -, *, bullet lists, or tables). "
    "Be brief and direct. Never sound like a call center agent or a search engine. "
    "BANNED OPENERS — never start a response with: 'Hey there', 'Great question', 'Absolutely', "
    "'Of course', 'Certainly', 'Sure thing', 'I hear you', 'I understand you saying', "
    "'That's a great', or any variation. Just answer. "
    "Do NOT repeat or paraphrase what the user just said. Do NOT end every reply with a question. "

    # --- Identity ---
    "IDENTITY: Do NOT address anyone by name unless a [FACE RECOGNITION] tag appears in this "
    "exact message confirming their identity. Different people use this interface. "
    "Never use names from memory or prior sessions without face recognition in this message. "

    # --- Critical tag rule ---
    "CRITICAL — EVERY RESPONSE MUST CONTAIN SPOKEN WORDS alongside any action tags. "
    "NEVER output a bare tag alone — the user hears silence and sees nothing. "
    "BAD: [CANVAS:page-id] GOOD: Here's your dashboard. [CANVAS:page-id] "
    "BAD: [MUSIC_PLAY] GOOD: Playing something for you now. [MUSIC_PLAY] "
    "Tags are invisible to the user — they only hear your words. "

    # --- Canvas: open existing page ---
    "CANVAS TAGS: "
    "[CANVAS:page-id] — opens a canvas page. Use exact page-id from the [Canvas pages:] list above. "
    "When opening, briefly say what the page shows (1-2 sentences). "
    "NEVER use the openclaw 'canvas' tool with action:'present' — it fails with 'node required'. "
    "ONLY the [CANVAS:page-id] tag works to open pages. "
    "Repeating [CANVAS:same-page] on an already-open page forces a refresh. "
    "[CANVAS_MENU] — opens the page picker so the user can browse all pages. "
    "[CANVAS_URL:https://example.com] — loads an external URL in the canvas iframe "
    "(only sites that allow iframe embedding). "

    # --- Canvas: create a new page ---
    "CREATING A NEW CANVAS PAGE: "
    "Step 1 — write the HTML file: write({path:'workspace/canvas/pagename.html', content:'<!DOCTYPE html>...'}). "
    "Step 2 — open it in your spoken response: 'Here it is. [CANVAS:pagename]' "
    "Step 3 — verify it opened: exec('curl -s http://openvoiceui:5001/api/canvas/context') "
    "returns {current_page, current_title}. If current_page matches → confirm to user. "
    "If still old page → say so and resend [CANVAS:pagename]. If null → say 'Opening canvas now.' and resend. "

    # --- Canvas: HTML rules ---
    "CANVAS HTML RULES (mandatory for every canvas page you create): "
    "NO external CDN scripts — Tailwind CDN, Bootstrap CDN, any <script src='https://...'> are BANNED (break in sandboxed iframes). "
    "All CSS and JS must be inline in <style> and <script> tags only. "
    "Google Fonts @import url(...) in <style> is OK. "
    "Dark theme: background #0d1117 or #13141a, text #e2e8f0, accent blue #3b82f6 or amber #f59e0b. "
    "Body: padding:20px; color:#e2e8f0; background:#0a0a0a; "
    "Make pages visual — cards, grids, tables, real data. No blank pages. "

    # --- Canvas: interactive buttons ---
    "CANVAS INTERACTIVE BUTTONS — use postMessage, never href='#': "
    "Trigger AI action: onclick=\"window.parent.postMessage({type:'canvas-action',action:'speak',text:'your message'},'*')\" "
    "Open another page: onclick=\"window.parent.postMessage({type:'canvas-action',action:'navigate',page:'page-id'},'*')\" "
    "Open page menu: onclick=\"window.parent.postMessage({type:'canvas-action',action:'menu'},'*')\" "
    "Close canvas: onclick=\"window.parent.postMessage({type:'canvas-action',action:'close'},'*')\" "
    "External links: use <a href='https://...' target='_blank'> — never href='#'. "

    # --- Canvas: make public ---
    "MAKE A PAGE PUBLIC (shareable without login): "
    "exec('curl -s -X PATCH http://openvoiceui:5001/api/canvas/manifest/page/PAGE_ID "
    "-H \"Content-Type: application/json\" -d \\'{{\"is_public\": true}}\\'') "
    "Shareable URL format: https://DOMAIN/pages/pagename.html "

    # --- Music ---
    "MUSIC TAGS: "
    "[MUSIC_PLAY] — play a random track. "
    "[MUSIC_PLAY:track name] — play specific track (use exact title from [Available tracks:] list above). "
    "[MUSIC_STOP] — stop music. "
    "[MUSIC_NEXT] — skip to next track. "
    "Only use music tags when the user explicitly asks — "
    "EXCEPT: when opening a music-related canvas page (music-list, playlist, library, etc.), "
    "also send [MUSIC_PLAY] in the same response so music starts playing alongside the page. "

    # --- Suno song generation ---
    "SONG GENERATION: "
    "[SUNO_GENERATE:description] — generates an AI song (~45 seconds). "
    "Always say something like 'I'll get that cooking now, should be ready in about 45 seconds!' "
    "The frontend handles Suno — do NOT call any Suno APIs yourself. "
    "After generation, the new song appears in [Available tracks:] by its title. "
    "Use [MUSIC_PLAY:song title] to play it — do NOT use exec/shell to find the file. "

    # --- Spotify ---
    "SPOTIFY: [SPOTIFY:song name] or [SPOTIFY:song name|artist name] — plays from Spotify. "
    "Example: [SPOTIFY:Bohemian Rhapsody|Queen]. Only use when user specifically asks. "

    # --- Sleep / goodbye ---
    "SLEEP: [SLEEP] — puts interface into passive wake-word mode. "
    "Use when user says goodbye, goodnight, stop listening, go to sleep, I'm out, peace, later, or similar. "
    "Always give a brief farewell (1-2 sentences) BEFORE the [SLEEP] tag. "
    "NEVER acknowledge that you 'should' sleep without including the [SLEEP] tag — the tag IS the action. "

    # --- Session reset ---
    "[SESSION_RESET] — clears conversation history and starts fresh. "
    "Use sparingly — only when context is clearly broken or user explicitly asks to start over. "

    # --- DJ soundboard ---
    "DJ SOUNDBOARD: [SOUND:name] — plays a sound effect. "
    "ONLY use in DJ mode (user explicitly said 'be a DJ', 'DJ mode', or 'put on a set'). "
    "NEVER use in normal conversation. "
    "Available sounds: air_horn, scratch_long, rewind, record_stop, crowd_cheer, crowd_hype, "
    "yeah, lets_go, gunshot, bruh, sad_trombone. "

    # --- Onboarding notifications ---
    "ONBOARDING NOTIFICATIONS (popup at top-center of screen): "
    "[NOTIFY:message] — show/update popup message. "
    "[NOTIFY_TITLE:text] — update popup title bar. "
    "[NOTIFY_PROGRESS:N/M] — show step progress dots (e.g. [NOTIFY_PROGRESS:2/5]). "
    "[NOTIFY_STATUS:text] — update small status line (e.g. '3 agents working...'). "
    "[NOTIFY_CLOSE] — hide popup temporarily. "
    "[NOTIFY_COMPLETE] — mark onboarding done (shows success, then auto-dismisses). "

    # --- Face registration ---
    "[REGISTER_FACE:Name] — captures and saves the person's face from camera. "
    "Only use when someone explicitly asks or introduces themselves. "
    "If camera is off, let them know. "

    # --- Camera vision ---
    "CAMERA VISION: When a [CAMERA VISION: ...] tag appears in the context above, "
    "it describes what the camera currently sees. Use it to answer the user's question naturally — "
    "do not repeat the raw description verbatim. If it says camera is off, let the user know. "

    "]"
)
218
+
219
+
220
def _is_vision_request(msg: str) -> bool:
    """True when *msg* contains any camera/vision trigger phrase."""
    haystack = msg.lower()
    for keyword in _VISION_KEYWORDS:
        if keyword in haystack:
            return True
    return False
224
+
225
+
226
+ def _cap_list(items, max_chars=2000, label="items"):
227
+ """Join items with ', ' but cap at max_chars. Add '... and N more' if truncated."""
228
+ if not items:
229
+ return "none"
230
+ result = []
231
+ total = 0
232
+ for item in items:
233
+ addition = len(item) + (2 if result else 0) # ', ' separator
234
+ if total + addition > max_chars and result:
235
+ remaining = len(items) - len(result)
236
+ result.append(f"... and {remaining} more")
237
+ break
238
+ result.append(item)
239
+ total += addition
240
+ return ', '.join(result)
241
+
242
+
243
+ # ---------------------------------------------------------------------------
244
+ # DB write queue — background thread so DB writes don't block HTTP responses
245
+ # (FIND-01 / FIND-08 fix from performance audit)
246
+ # ---------------------------------------------------------------------------
247
+
248
+ _db_write_queue: queue.Queue = queue.Queue()
249
+
250
+
251
+ def _db_writer_loop():
252
+ """Background daemon that drains _db_write_queue and writes to SQLite.
253
+
254
+ Queue items: (db_path_str, sql, params).
255
+ db_path_str is resolved at enqueue time so test patches to DB_PATH work.
256
+ Connections are cached per db_path to reuse WAL-mode connections.
257
+ """
258
+ connections: dict = {}
259
+ while True:
260
+ try:
261
+ db_path_str, sql, params = _db_write_queue.get(timeout=5)
262
+ except queue.Empty:
263
+ continue
264
+ try:
265
+ if db_path_str not in connections:
266
+ conn = sqlite3.connect(db_path_str, check_same_thread=False, timeout=30)
267
+ conn.execute("PRAGMA journal_mode=WAL")
268
+ conn.execute("PRAGMA synchronous=NORMAL")
269
+ conn.execute("PRAGMA cache_size=-64000")
270
+ conn.execute("PRAGMA busy_timeout=30000")
271
+ connections[db_path_str] = conn
272
+ connections[db_path_str].execute(sql, params)
273
+ connections[db_path_str].commit()
274
+ except Exception as e:
275
+ logger.error(f"[db-writer] loop error: {e}")
276
+ finally:
277
+ _db_write_queue.task_done()
278
+
279
+
280
+ _db_writer_thread = threading.Thread(
281
+ target=_db_writer_loop,
282
+ name="conv-db-writer",
283
+ daemon=True,
284
+ )
285
+ _db_writer_thread.start()
286
+
287
+
288
def _flush_db_writes(timeout: float = 5.0) -> None:
    """Block until all queued DB writes are processed. For use in tests.

    Fix: the original called Queue.join() and silently ignored *timeout*,
    which hangs forever if the writer thread has died. Poll the queue's
    unfinished-task counter instead and give up after *timeout* seconds.
    (Queue.unfinished_tasks is the counter behind join()/task_done().)
    """
    deadline = time.monotonic() + timeout
    while _db_write_queue.unfinished_tasks:
        if time.monotonic() >= deadline:
            return  # best-effort: don't hang tests on a stuck writer
        time.sleep(0.01)
291
+
292
+ # ---------------------------------------------------------------------------
293
+ # In-memory session key cache (FIND-02 fix from performance audit)
294
+ # ---------------------------------------------------------------------------
295
+
296
# Cached stable session key; populated lazily by get_voice_session_key()
# and invalidated by bump_voice_session().
_session_key_cache: str | None = None
# Guards writes to _session_key_cache (double-checked locking).
_session_key_lock = threading.Lock()
_session_recovery_key: str | None = None  # Set after double-empty to escape poisoned session

# ---------------------------------------------------------------------------
# Conversation state (module-level singletons)
# ---------------------------------------------------------------------------

#: In-process conversation history keyed by session_id.
#: Cleared on conversation reset; also restored from DB on first access.
conversation_histories: dict = {}

#: Tracks consecutive empty Gateway responses for auto-reset logic.
_consecutive_empty_responses: int = 0
310
+
311
+ # ---------------------------------------------------------------------------
312
+ # Voice session management
313
+ # (moved here from server.py so the blueprint owns the session counter)
314
+ # ---------------------------------------------------------------------------
315
+
316
+
317
def _save_session_counter(counter: int) -> None:
    """Persist the voice-session reset counter to VOICE_SESSION_FILE (overwrites)."""
    with open(VOICE_SESSION_FILE, 'w') as f:
        f.write(str(counter))
320
+
321
+
322
def get_voice_session_key() -> str:
    """Return the current voice session key.

    Uses a STABLE key (no incrementing counter) so the Z.AI prompt cache
    stays warm across session resets. OpenClaw's daily reset handles context
    clearing — we don't need a new key for that.

    If the session is poisoned (double-empty detected), returns a recovery key
    to force openclaw onto a fresh session. Cleared on first successful response.

    Priority: recovery key → GATEWAY_SESSION_KEY env → VOICE_SESSION_PREFIX env → 'voice-main'
    Cache is invalidated by bump_voice_session() (explicit agent reset only).
    """
    global _session_key_cache
    # Auto-clear stale recovery keys (stuck >60s)
    _check_recovery_timeout()
    # If session is poisoned, use recovery key to escape
    if _session_recovery_key is not None:
        return _session_recovery_key
    # Lock-free fast path: cache already populated.
    if _session_key_cache is not None:
        return _session_key_cache
    # Double-checked locking: re-test under the lock before populating.
    with _session_key_lock:
        if _session_key_cache is not None:
            return _session_key_cache
        # Use GATEWAY_SESSION_KEY if set (unique per user), else prefix
        _gw_key = os.getenv('GATEWAY_SESSION_KEY')
        if _gw_key:
            _session_key_cache = _gw_key
        else:
            _prefix = os.getenv('VOICE_SESSION_PREFIX', 'voice-main')
            _session_key_cache = _prefix
        return _session_key_cache
354
+
355
+
356
def bump_voice_session() -> str:
    """Increment the session counter and invalidate the cache so the key
    is re-read from GATEWAY_SESSION_KEY on next call.

    The counter file is still incremented for logging/tracking how many
    resets have occurred, but the actual session key stays stable (e.g.
    'main') so it matches the heartbeat session and keeps the Z.AI prompt
    cache warm.
    """
    global _consecutive_empty_responses, _session_key_cache
    try:
        with open(VOICE_SESSION_FILE, 'r') as f:
            counter = int(f.read().strip())
    except (FileNotFoundError, ValueError):
        counter = 6  # baseline when the counter file is missing/corrupt — TODO confirm why 6
    counter += 1
    _save_session_counter(counter)
    # A fresh session can't have accumulated empty responses yet.
    _consecutive_empty_responses = 0
    with _session_key_lock:
        _session_key_cache = None  # invalidate cache; next call re-reads env var
    # Called OUTSIDE the lock: get_voice_session_key() takes the same
    # (non-reentrant) lock to repopulate the cache.
    stable_key = get_voice_session_key()
    logger.info(f'### SESSION RESET #{counter}: cache invalidated, key stays stable as "{stable_key}"')
    return stable_key
379
+
380
+
381
# Wall-clock time (epoch seconds) when recovery mode was last entered.
_recovery_entered_at: float = 0


def _enter_session_recovery():
    """Switch to a temporary recovery session key after double-empty.

    Openclaw will create a fresh session for this key, escaping the
    poisoned state. The recovery key is cleared on the first successful
    (non-empty, non-fallback) response.
    """
    global _session_recovery_key, _recovery_entered_at
    # Cooldown: don't thrash recovery keys from rapid start/stop cycles
    now = time.time()
    if now - _recovery_entered_at < 30:
        logger.info('### SESSION RECOVERY: skipping — cooldown active (entered <30s ago)')
        return
    _recovery_entered_at = now
    # Fix: use the epoch we already have. The previous
    # datetime.utcnow().timestamp() round-trip produced a skewed epoch on
    # non-UTC hosts (utcnow() is naive, .timestamp() assumes local time),
    # and the local `import datetime` shadowed the module-level
    # `from datetime import datetime`.
    _session_recovery_key = f'recovery-{int(now)}'
    logger.warning(f'### SESSION RECOVERY: switching to key "{_session_recovery_key}" to escape poisoned session')
399
+
400
+
401
def _exit_session_recovery():
    """Drop the recovery key after a successful response.

    Subsequent requests return to the stable (cache-warm) session key.
    No-op when recovery mode is not active.
    """
    global _session_recovery_key
    if _session_recovery_key is None:
        return
    old_recovery = _session_recovery_key
    _session_recovery_key = None
    stable = get_voice_session_key()
    logger.info(f'### SESSION RECOVERY CLEARED: "{old_recovery}" → back to stable key "{stable}"')
410
+
411
+
412
def _check_recovery_timeout():
    """Auto-clear a recovery key that has been active for more than 60s.

    If no successful response arrives while in recovery, the recovery key
    itself may be stuck — fall back to the stable key.
    """
    global _session_recovery_key, _recovery_entered_at
    if _session_recovery_key is None:
        return
    if time.time() - _recovery_entered_at <= 60:
        return
    logger.warning(f'### SESSION RECOVERY TIMEOUT: "{_session_recovery_key}" active for >60s — clearing')
    _session_recovery_key = None
420
+
421
+
422
+ # ---------------------------------------------------------------------------
423
+ # Helper: notify Brain (non-critical fire-and-forget)
424
+ # ---------------------------------------------------------------------------
425
+
426
+
427
+ def _notify_brain(event_type: str, **data) -> None:
428
+ """Append an event to the Brain events file for context tracking."""
429
+ try:
430
+ event = {'type': event_type, 'timestamp': datetime.now().isoformat()}
431
+ event.update(data)
432
+ with open(BRAIN_EVENTS_PATH, 'a') as f:
433
+ f.write(json.dumps(event) + '\n')
434
+ except Exception:
435
+ pass # Non-critical
436
+
437
+ # ---------------------------------------------------------------------------
438
+ # Helper: log conversation to SQLite
439
+ # ---------------------------------------------------------------------------
440
+
441
+
442
def log_conversation(role: str, message: str, session_id: str = 'default',
                     tts_provider: str = None, voice: str = None) -> None:
    """Log a single conversation turn to the database (non-blocking).

    Write is queued to the background db-writer thread (FIND-01 fix),
    then the Brain event feed is notified (best-effort).
    """
    sql = (
        'INSERT INTO conversation_log '
        '(session_id, role, message, tts_provider, voice, created_at) '
        'VALUES (?, ?, ?, ?, ?, ?)'
    )
    params = (session_id, role, message, tts_provider, voice, datetime.now().isoformat())
    # DB_PATH is stringified at enqueue time so test patches take effect.
    _db_write_queue.put((str(DB_PATH), sql, params))
    _notify_brain('conversation', role=role, message=message, session=session_id)
456
+
457
+ # ---------------------------------------------------------------------------
458
+ # Helper: log timing metrics
459
+ # ---------------------------------------------------------------------------
460
+
461
+
462
def log_metrics(metrics: dict) -> None:
    """Log conversation timing metrics to SQLite + journalctl (non-blocking).

    Write is queued to the background db-writer thread (FIND-01 fix).

    Args:
        metrics: per-request timing/outcome fields; missing keys fall back
            to the defaults used below (e.g. tts_success=1, tool_count=0).
    """
    # Human-readable one-liner for journalctl / log scraping.
    logger.info(
        f"[METRICS] profile={metrics.get('profile')} "
        f"handshake={metrics.get('handshake_ms')}ms "
        f"llm={metrics.get('llm_inference_ms')}ms "
        f"tts={metrics.get('tts_generation_ms')}ms "
        f"total={metrics.get('total_ms')}ms "
        f"resp_len={metrics.get('response_len')} "
        f"tts_ok={metrics.get('tts_success', 1)} "
        f"tools={metrics.get('tool_count', 0)} "
        f"fallback={metrics.get('fallback_used', 0)}"
    )
    # Queued insert — the values tuple MUST stay in the same order as the
    # 17-entry column list above it.
    _db_write_queue.put((
        str(DB_PATH),
        '''INSERT INTO conversation_metrics
           (session_id, profile, model, handshake_ms, llm_inference_ms,
            tts_generation_ms, total_ms, user_message_len, response_len,
            tts_text_len, tts_provider, tts_success, tts_error,
            tool_count, fallback_used, error, created_at)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
        (
            metrics.get('session_id', 'default'),
            metrics.get('profile', 'unknown'),
            metrics.get('model', 'unknown'),
            metrics.get('handshake_ms'),
            metrics.get('llm_inference_ms'),
            metrics.get('tts_generation_ms'),
            metrics.get('total_ms'),
            metrics.get('user_message_len'),
            metrics.get('response_len'),
            metrics.get('tts_text_len'),
            metrics.get('tts_provider'),
            metrics.get('tts_success', 1),
            metrics.get('tts_error'),
            metrics.get('tool_count', 0),
            metrics.get('fallback_used', 0),
            metrics.get('error'),
            datetime.now().isoformat(),
        ),
    ))
506
+
507
+ # ---------------------------------------------------------------------------
508
+ # Helper: clean text for TTS
509
+ # ---------------------------------------------------------------------------
510
+
511
+
512
+ def _truncate_at_sentence(text: str, max_chars: int) -> str:
513
+ """Truncate text at the nearest sentence boundary at or before max_chars.
514
+ Falls back to hard truncation if no boundary is found."""
515
+ if not text or len(text) <= max_chars:
516
+ return text
517
+ chunk = text[:max_chars]
518
+ # Find last sentence-ending punctuation before the cap
519
+ last_boundary = max(chunk.rfind('.'), chunk.rfind('!'), chunk.rfind('?'))
520
+ if last_boundary > 0:
521
+ return chunk[:last_boundary + 1].strip()
522
+ return chunk.strip()
523
+
524
+
525
def clean_for_tts(text: str) -> str:
    """Normalize LLM output into speakable text for TTS.

    Transformations, in order:
      * strip GPT-OSS reasoning sentinels (NO_REPLY / trailing NO/YES),
        unless NO/YES *is* the whole response;
      * remove frontend action tags ([CANVAS:...], [MUSIC_PLAY...], etc.)
        that are handled by the UI and must not be spoken;
      * drop code blocks and inline code;
      * turn headings, list items and table rows into sentence-like
        fragments so the voice pauses naturally;
      * strip markdown emphasis, links, URLs and file paths;
      * expand acronyms and symbols to spoken equivalents;
      * collapse whitespace and leading punctuation.

    Returns '' for empty input.
    """
    if not text:
        return ''

    # Strip GPT-OSS-120B reasoning tokens (but not if NO/YES is the full response)
    if text.strip().upper() not in ['NO', 'YES', 'NO.', 'YES.']:
        text = re.sub(r'^NO_REPLY\s*', '', text)
        text = re.sub(r'\s+NO\s*$', '', text, flags=re.IGNORECASE)
        text = re.sub(r'\s+YES\s*$', '', text, flags=re.IGNORECASE)

    # Remove canvas/task/music triggers (handled by frontend, not spoken)
    text = re.sub(r'\[CANVAS_MENU\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[CANVAS:[^\]]*\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[CANVAS_URL:[^\]]*\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[MUSIC_PLAY(?::[^\]]*)?\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[MUSIC_STOP\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[MUSIC_NEXT\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[SUNO_GENERATE:[^\]]*\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[SLEEP\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[REGISTER_FACE:[^\]]*\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[SPOTIFY:[^\]]*\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[SOUND:[^\]]*\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[SESSION_RESET\]', '', text, flags=re.IGNORECASE)

    # Remove code blocks (complete fences first, then any unclosed fence to end of text)
    text = re.sub(r'```[\s\S]*?```', '', text)
    text = re.sub(r'```[\s\S]*', '', text)
    text = re.sub(r'`[^`]+`', '', text)

    # Add natural pauses for structured content (must happen before stripping markdown)
    text = re.sub(r'^(#+\s+.+?)([^.!?])\s*$', r'\1\2.', text, flags=re.MULTILINE)

    def _ensure_list_item_pause(match):
        # Numbered list item: make sure the content ends with pause punctuation.
        prefix = match.group(1)
        content = match.group(2).strip()
        if content and content[-1] not in '.!?:':
            content += '.'
        return f'{prefix} {content}'
    text = re.sub(r'^(\s*\d+[.)]\s*)(.+?)$', _ensure_list_item_pause,
                  text, flags=re.MULTILINE)

    def _ensure_bullet_pause(match):
        # Bullet item: drop the bullet marker, keep content with trailing pause.
        content = match.group(1).strip()
        if content and content[-1] not in '.!?:':
            content += '.'
        return content
    text = re.sub(r'^\s*[-*•]\s+(.+?)$', _ensure_bullet_pause,
                  text, flags=re.MULTILINE)

    def _table_row_to_speech(match):
        # Markdown table row -> comma-separated spoken list; separator rows dropped.
        row = match.group(0)
        if re.match(r'^[\s|:-]+$', row):
            return ''
        cells = [c.strip() for c in row.split('|') if c.strip()]
        if not cells:
            return ''
        return ', '.join(cells) + '.'
    text = re.sub(r'^\|.+\|$', _table_row_to_speech, text, flags=re.MULTILINE)

    # Short standalone lines (likely headings/labels) get a closing period
    # so TTS pauses instead of running them into the next line.
    lines = text.split('\n')
    for i, line in enumerate(lines):
        stripped = line.strip()
        if stripped and len(stripped) < 80 and stripped[-1] not in '.!?:,;':
            if re.match(r'^[A-Za-z0-9]', stripped):
                lines[i] = stripped + '.'
    text = '\n'.join(lines)

    # Strip markdown formatting
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
    text = re.sub(r'\*([^*]+)\*', r'\1', text)
    text = re.sub(r'__([^_]+)__', r'\1', text)
    text = re.sub(r'_([^_]+)_', r'\1', text)
    text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'/[\w/.-]+', '', text)

    # Expand acronyms to speakable form.
    # NOTE: keys must be unique — a duplicate 'API' entry was removed
    # (dict literals silently keep only the last occurrence of a key).
    acronyms = {
        'API': 'api', 'HTML': 'html', 'CSS': 'css', 'JSON': 'jason',
        'HTTP': 'http', 'HTTPS': 'https', 'URL': 'url', 'TTS': 'text to speech',
        'STT': 'speech to text', 'LLM': 'large language model', 'AI': 'A.I.',
        'UI': 'user interface', 'UX': 'user experience', 'RAM': 'ram',
        'CPU': 'cpu', 'GPU': 'gpu', 'DB': 'database', 'VPS': 'server',
        'SSH': 'ssh', 'CLI': 'command line', 'SDK': 'sdk',
    }
    for acronym, expansion in acronyms.items():
        text = re.sub(r'\b' + acronym + r'\b', expansion, text)

    # Replace symbols with spoken equivalents
    text = text.replace('&', ' and ')
    text = text.replace('%', ' percent ')
    text = text.replace('$', ' dollars ')
    text = text.replace('@', ' at ')
    text = text.replace('#', ' number ')
    text = text.replace('+', ' plus ')
    text = text.replace('=', ' equals ')

    # Clean up whitespace
    text = re.sub(r'\n+', '. ', text)
    text = re.sub(r'\.{2,}', '.', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'\.\s*\.', '.', text)
    # Strip leading punctuation/spaces (e.g. from [MUSIC_STOP]\n\n → ". text")
    text = re.sub(r'^[.,;:\s]+', '', text)

    return text
633
+
634
+ # ---------------------------------------------------------------------------
635
+ # Helper: legacy Supertonic voice accessor
636
+ # ---------------------------------------------------------------------------
637
+
638
+
639
def get_supertonic_for_voice(voice_style: str):
    """Return the unified Supertonic TTS provider.

    *voice_style* is accepted for backward compatibility with older
    callers but is ignored — a single provider now serves every voice.
    """
    _ = voice_style  # intentionally unused (legacy signature)
    return get_provider('supertonic')
642
+
643
+ # ---------------------------------------------------------------------------
644
+ # Blueprint
645
+ # ---------------------------------------------------------------------------
646
+
647
conversation_bp = Blueprint('conversation', __name__)  # holds the /api/conversation routes defined below
648
+
649
+ # ---------------------------------------------------------------------------
650
+ # POST /api/conversation — main voice conversation endpoint
651
+ # ---------------------------------------------------------------------------
652
+
653
+
654
@conversation_bp.route('/api/conversation', methods=['POST'])
def conversation():
    """
    Handle voice conversation flow.

    Request JSON:
        message : str — transcribed user speech (required)
        tts_provider : str — 'supertonic' | 'groq' (default: env DEFAULT_TTS_PROVIDER or groq)
        voice : str — voice ID, e.g. 'M1' (default: M1)
        session_id : str — session identifier (default: default)
        ui_context : dict — canvas/music state from frontend (optional)

    Response JSON (non-streaming):
        response : str — AI text response
        audio : str — base64-encoded audio (if TTS succeeds)
        timing : dict — handshake/llm/tts/total ms
        actions : list — Gateway tool/lifecycle events (optional)

    All real work happens in _conversation_inner(); this wrapper is a
    last-resort exception boundary that returns a spoken-friendly 500
    instead of letting the exception propagate.
    """
    try:
        return _conversation_inner()
    except Exception:
        # logger.exception logs the message plus the full traceback —
        # replaces the previous manual traceback.format_exc() with an
        # unused `except ... as e` binding.
        logger.exception('FATAL: unhandled error in /api/conversation')
        return jsonify({
            'response': 'Something went wrong on my end. Try again?',
            'error': 'Internal server error'
        }), 500
682
+
683
+
684
+ def _conversation_inner():
685
+ global _consecutive_empty_responses
686
+
687
+ t_request_start = time.time()
688
+ metrics = {
689
+ 'profile': 'gateway',
690
+ 'model': 'glm-4.7-flash',
691
+ 'tts_success': 1,
692
+ 'fallback_used': 0,
693
+ 'tool_count': 0,
694
+ }
695
+
696
+ data = request.get_json()
697
+ if not data:
698
+ logger.error('ERROR: No JSON data in request')
699
+ return jsonify({'error': 'No JSON data provided'}), 400
700
+
701
+ logger.info(f'Received conversation request: {data}')
702
+
703
+ user_message = data.get('message', '').strip()
704
+ tts_provider = data.get('tts_provider') or os.getenv('DEFAULT_TTS_PROVIDER', 'groq')
705
+ voice = data.get('voice', 'M1')
706
+ session_id = data.get('session_id', 'default')
707
+ ui_context = data.get('ui_context', {})
708
+ identified_person = data.get('identified_person') or None
709
+ agent_id = data.get('agent_id') or None # e.g. 'default'; None = default 'main'
710
+ gateway_id = data.get('gateway_id') or None # plugin gateway id; None = 'openclaw'
711
+ max_response_chars = data.get('max_response_chars') or None # profile cap, truncates at sentence boundary
712
+ image_path = data.get('image_path') or None # uploaded image for vision analysis
713
+ metrics['session_id'] = session_id
714
+ metrics['user_message_len'] = len(user_message)
715
+ metrics['tts_provider'] = tts_provider
716
+
717
+ if not user_message:
718
+ return jsonify({'error': 'No message provided'}), 400
719
+
720
+ # Filter garbage STT fragments — punctuation-only, single short words, noise
721
+ import re as _re
722
+ _meaningful_chars = _re.sub(r'[^a-zA-Z0-9]', '', user_message)
723
+ if len(_meaningful_chars) < 3:
724
+ logger.info(f'### FILTERED garbage STT: "{user_message}" ({len(_meaningful_chars)} meaningful chars)')
725
+ # Return a no-op stream that ends cleanly — no fallback message shown
726
+ def _noop_stream():
727
+ yield "data: " + json.dumps({"type": "filtered", "reason": "garbage_stt"}) + "\n\n"
728
+ yield "data: " + json.dumps({"type": "text_done", "response": " "}) + "\n\n"
729
+ return Response(_noop_stream(), mimetype='text/event-stream')
730
+
731
+ # Input length guard (P7-T3 security audit)
732
+ if len(user_message) > 4000:
733
+ return jsonify({'error': 'Message too long (max 4000 characters)'}), 400
734
+
735
+ wants_stream = (
736
+ request.args.get('stream') == '1'
737
+ or request.headers.get('X-Stream-Response') == '1'
738
+ )
739
+
740
+ # Update canvas context from UI state
741
+ if ui_context.get('canvasDisplayed'):
742
+ update_canvas_context(
743
+ ui_context['canvasDisplayed'],
744
+ title=ui_context['canvasDisplayed']
745
+ .replace('/pages/', '')
746
+ .replace('.html', '')
747
+ .replace('-', ' ')
748
+ .title()
749
+ )
750
+
751
+ # Build context prefix from UI state
752
+ t_context_start = time.time()
753
+ context_prefix = ''
754
+ context_parts = []
755
+
756
+ # Inject face recognition identity
757
+ if identified_person and identified_person.get('name') and identified_person.get('name') != 'unknown':
758
+ name = identified_person['name']
759
+ confidence = identified_person.get('confidence', 0)
760
+ context_parts.append(
761
+ f'[FACE RECOGNITION: The person you are speaking with has been identified as {name} '
762
+ f'({confidence}% confidence). Address them by name naturally.]'
763
+ )
764
+
765
+ # Vision: if user asks about what the camera sees, call vision model with latest frame
766
+ if _is_vision_request(user_message):
767
+ from routes.vision import _latest_frame, _call_vision
768
+ _frame_img = _latest_frame.get('image')
769
+ _frame_age = time.time() - _latest_frame.get('ts', 0)
770
+ if _frame_img and _frame_age < _VISION_FRAME_MAX_AGE:
771
+ try:
772
+ _vision_desc = _call_vision(
773
+ _frame_img,
774
+ 'Describe what you see in this image concisely. Focus on people, objects, and actions.',
775
+ )
776
+ context_parts.append(f'[CAMERA VISION: {_vision_desc}]')
777
+ except Exception as exc:
778
+ logger.warning('Vision analysis failed: %s', exc)
779
+ context_parts.append('[CAMERA VISION: Camera is on but vision analysis failed.]')
780
+ elif not _frame_img:
781
+ context_parts.append('[CAMERA VISION: No camera frame available — camera may be off.]')
782
+ else:
783
+ context_parts.append('[CAMERA VISION: Camera frame is stale — camera may have been turned off.]')
784
+
785
+ # Vision: if user uploaded an image, analyze it with vision model
786
+ if image_path:
787
+ try:
788
+ _img_file = Path(image_path).resolve()
789
+ # Security: only allow files inside uploads/ directories
790
+ if 'uploads' not in _img_file.parts:
791
+ raise ValueError(f'Path traversal blocked: {image_path}')
792
+ if _img_file.is_file() and _img_file.stat().st_size < 20_000_000: # 20MB safety cap
793
+ from routes.vision import _call_vision
794
+ _img_b64 = base64.b64encode(_img_file.read_bytes()).decode('ascii')
795
+ _upload_desc = _call_vision(
796
+ _img_b64,
797
+ 'Describe what you see in this image in detail. Include colors, objects, text, people, layout, and any notable features.',
798
+ )
799
+ context_parts.append(f'[UPLOADED IMAGE ANALYSIS: {_upload_desc}]')
800
+ logger.info('Vision analysis of uploaded image succeeded (%d bytes)', _img_file.stat().st_size)
801
+ else:
802
+ logger.warning('Uploaded image not found or too large: %s', image_path)
803
+ context_parts.append('[UPLOADED IMAGE: File could not be analyzed — may be too large or missing.]')
804
+ except Exception as exc:
805
+ logger.warning('Vision analysis of uploaded image failed: %s', exc)
806
+ context_parts.append('[UPLOADED IMAGE: Vision analysis failed — the image was uploaded but could not be analyzed.]')
807
+
808
+ if ui_context:
809
+ # Canvas state
810
+ if ui_context.get('canvasVisible') and ui_context.get('canvasDisplayed'):
811
+ page_name = (ui_context['canvasDisplayed']
812
+ .replace('/pages/', '')
813
+ .replace('.html', '')
814
+ .replace('-', ' '))
815
+ context_parts.append(f'[Canvas OPEN: {page_name}]')
816
+ elif not ui_context.get('canvasVisible'):
817
+ context_parts.append('[Canvas CLOSED]')
818
+ if ui_context.get('canvasMenuOpen'):
819
+ context_parts.append('[Canvas menu visible to user]')
820
+ # Canvas JS errors — auto-injected from browser error buffer
821
+ canvas_errors = ui_context.get('canvasErrors', [])
822
+ if canvas_errors:
823
+ err_str = ' | '.join(canvas_errors)
824
+ context_parts.append(f'[Canvas JS Errors: {err_str}]')
825
+
826
+ # Music state (server-side is authoritative)
827
+ _srv_track = _music_state.get('current_track')
828
+ _srv_playing = _music_state.get('playing', False)
829
+ if _srv_playing and _srv_track:
830
+ _track_name = _srv_track.get('title') or _srv_track.get('name', 'unknown')
831
+ context_parts.append(f'[Music PLAYING: {_track_name}]')
832
+ elif _srv_track:
833
+ _track_name = _srv_track.get('title') or _srv_track.get('name', 'unknown')
834
+ context_parts.append(f'[Music PAUSED/STOPPED — last track: {_track_name}]')
835
+ elif ui_context.get('musicPlaying'):
836
+ track = ui_context.get('musicTrack', 'unknown')
837
+ context_parts.append(f'[Music PLAYING: {track}]')
838
+
839
+ # Available music tracks (so agent can use [MUSIC_PLAY:exact name])
840
+ try:
841
+ from routes.music import get_music_files
842
+ _lib_tracks = get_music_files('library')
843
+ _gen_tracks = get_music_files('generated')
844
+ _lib_names = [t.get('title') or t.get('name', '') for t in _lib_tracks]
845
+ _gen_names = [t.get('title') or t.get('name', '') for t in _gen_tracks]
846
+ _lib_names = [n for n in _lib_names if n]
847
+ _gen_names = [n for n in _gen_names if n]
848
+ _parts = []
849
+ if _lib_names:
850
+ _parts.append(f'Library ({len(_lib_names)}): {_cap_list(_lib_names, max_chars=2000)}')
851
+ if _gen_names:
852
+ _parts.append(f'Generated ({len(_gen_names)}): {_cap_list(_gen_names, max_chars=2000)}')
853
+ if _parts:
854
+ context_parts.append(f'[Available tracks — {" | ".join(_parts)}]')
855
+ except Exception:
856
+ pass
857
+
858
+ # Recently completed Suno generations — agent gets notified on next turn
859
+ try:
860
+ from routes.suno import completed_songs_queue
861
+ if completed_songs_queue:
862
+ _pending = completed_songs_queue[-3:]
863
+ _titles = [s.get('title', 'Unknown Track') for s in _pending]
864
+ context_parts.append(f'[Suno just finished: {", ".join(repr(t) for t in _titles)} — now ready in Generated playlist]')
865
+ except Exception:
866
+ pass
867
+
868
+ # Available canvas pages (agent needs IDs for [CANVAS:page-id])
869
+ try:
870
+ from routes.canvas import load_canvas_manifest
871
+ _manifest = load_canvas_manifest()
872
+ _page_ids = sorted(_manifest.get('pages', {}).keys())
873
+ _page_list = _cap_list(_page_ids, max_chars=1000)
874
+ except Exception:
875
+ _page_list = 'unknown'
876
+ context_parts.append(f'[Canvas pages: {_page_list}]')
877
+
878
+ # Available DJ sounds (for [SOUND:name] in DJ mode)
879
+ context_parts.append(
880
+ '[DJ sounds: air_horn, scratch_long, rewind, record_stop, '
881
+ 'crowd_cheer, crowd_hype, yeah, lets_go, gunshot, bruh, sad_trombone]'
882
+ )
883
+ # Inject active profile's custom system_prompt (admin editor → runtime)
884
+ # Also read min_sentence_chars for TTS sentence extraction.
885
+ _min_sentence_chars = 40 # default — prevents choppy short TTS fragments
886
+ try:
887
+ from profiles.manager import get_profile_manager
888
+ from routes.profiles import _active_profile_id
889
+ _mgr = get_profile_manager()
890
+ _prof = _mgr.get_profile(_active_profile_id)
891
+ if _prof and _prof.system_prompt and _prof.system_prompt.strip():
892
+ context_parts.append(f'[PROFILE INSTRUCTIONS: {_prof.system_prompt.strip()}]')
893
+ if _prof and hasattr(_prof, 'voice') and _prof.voice and _prof.voice.min_sentence_chars:
894
+ _min_sentence_chars = _prof.voice.min_sentence_chars
895
+ except Exception:
896
+ pass # Profile system not available — skip gracefully
897
+
898
+ # Inject voice assistant instructions so the agent knows about action tags.
899
+ # This must be in-app (not workspace files) so it works out of the box.
900
+ context_parts.append(_load_voice_system_prompt())
901
+
902
+ if context_parts:
903
+ context_prefix = ' '.join(context_parts) + ' '
904
+
905
+ t_context_ms = int((time.time() - t_context_start) * 1000)
906
+ if t_context_ms > 50:
907
+ logger.info(f"### CONTEXT BUILD TIMING: {t_context_ms}ms ({len(context_parts)} parts, {len(context_prefix)} chars)")
908
+
909
+ log_conversation('user', user_message, session_id=session_id,
910
+ tts_provider=tts_provider, voice=voice)
911
+
912
+ # Replace the legacy __session_start__ sentinel with a natural-language greeting
913
+ # prompt so the LLM produces a real greeting instead of a system sentinel ("NO").
914
+ # user_message is kept as-is so the sentinel suppression logic still works.
915
+ if user_message == '__session_start__':
916
+ _face = identified_person or {}
917
+ _face_name = _face.get('name', '') if _face.get('name', '') != 'unknown' else ''
918
+ if _face_name:
919
+ _gateway_message = (
920
+ f'A new voice session has just started. The person in front of the camera '
921
+ f'has been identified as {_face_name}. Greet them by name — '
922
+ f'one brief, friendly sentence.'
923
+ )
924
+ else:
925
+ _gateway_message = (
926
+ 'A new voice session has just started. Give a brief, friendly one-sentence greeting. '
927
+ 'Do NOT address anyone by name — no face has been recognized and you do not know who is speaking.'
928
+ )
929
+ elif user_message.startswith('__suno_complete__:'):
930
+ _song_title = user_message[len('__suno_complete__:'):].strip() or 'your track'
931
+ _gateway_message = (
932
+ f'The Suno song "{_song_title}" just finished generating and is now ready in the music player. '
933
+ f'Let the user know in one brief, friendly sentence and offer to play it for them.'
934
+ )
935
+ else:
936
+ _gateway_message = user_message
937
+ message_with_context = context_prefix + _gateway_message if context_prefix else _gateway_message
938
+ ai_response = None
939
+ captured_actions = []
940
+
941
+ # ── PRIMARY PATH: Gateway (routed by gateway_id from request/profile) ──
942
+ if gateway_manager.is_configured():
943
+ try:
944
+ logger.info('### Starting Gateway connection...')
945
+ event_queue: queue.Queue = queue.Queue()
946
+ _session_key = get_voice_session_key()
947
+
948
+ # Check if gateway recently reconnected after a failure —
949
+ # inject a system note so the agent acknowledges the interruption
950
+ _recovery_prefix = ''
951
+ try:
952
+ _gw = gateway_manager.get(gateway_id)
953
+ if _gw and hasattr(_gw, 'consume_reconnection') and _gw.consume_reconnection():
954
+ _recovery_prefix = (
955
+ '[SYSTEM: The connection was briefly interrupted (server restart). '
956
+ 'Briefly acknowledge this to the user before responding to their message.]\n\n'
957
+ )
958
+ logger.info('### Injecting recovery prefix into message')
959
+ except Exception:
960
+ pass
961
+
962
+ def _run_gateway():
963
+ _msg = _recovery_prefix + message_with_context if _recovery_prefix else message_with_context
964
+ gateway_manager.stream_to_queue(
965
+ event_queue, _msg, _session_key, captured_actions,
966
+ gateway_id=gateway_id,
967
+ agent_id=agent_id,
968
+ )
969
+
970
+ t_llm_start = time.time()
971
+ gw_thread = threading.Thread(target=_run_gateway, daemon=True)
972
+ gw_thread.start()
973
+
974
+ if wants_stream:
975
+ # ── STREAMING MODE ────────────────────────────────────────
976
+ def stream_response():
977
+ nonlocal ai_response, event_queue, t_llm_start
978
+
979
+ # ── TTS helpers ───────────────────────────────────────
980
+ try:
981
+ _prov = get_provider(tts_provider)
982
+ _audio_fmt = _prov.get_info().get('audio_format', 'wav')
983
+ except Exception:
984
+ _audio_fmt = 'wav'
985
+
986
+ def _tts_error_event(err_str):
987
+ code_match = re.search(r'\[groq:([^\]]+)\]', err_str)
988
+ err_code = code_match.group(1) if code_match else 'unknown'
989
+ REASONS = {
990
+ 'model_terms_required': ('terms', 'Accept Orpheus terms at console.groq.com'),
991
+ 'rate_limit_exceeded': ('rate_limit', 'Groq rate limit hit — try again shortly'),
992
+ 'insufficient_quota': ('no_credits', 'Groq account out of credits'),
993
+ 'invalid_api_key': ('bad_key', 'Invalid GROQ_API_KEY'),
994
+ 'unknown': ('error', err_str),
995
+ }
996
+ reason_key, reason_msg = REASONS.get(err_code, ('error', err_str))
997
+ return json.dumps({
998
+ 'type': 'tts_error',
999
+ 'provider': tts_provider,
1000
+ 'reason': reason_key,
1001
+ 'error': reason_msg,
1002
+ }) + '\n'
1003
+
1004
+ # ── Mid-stream TTS helpers ────────────────────────────
1005
+ def _has_open_tag(text):
1006
+ """True while inside an incomplete [...] action tag or open code fence."""
1007
+ if text.count('[') > text.count(']'):
1008
+ return True
1009
+ # Odd number of ``` markers means we're inside a code block
1010
+ if text.count('```') % 2 != 0:
1011
+ return True
1012
+ return False
1013
+
1014
+ def _extract_sentence(text, min_len=40):
1015
+ """Return (sentence, remainder) at first sentence boundary
1016
+ that falls at or after min_len chars. Skips boundaries that
1017
+ are likely inside abbreviations (e.g. A.I., Mr.)."""
1018
+ if len(text) < min_len:
1019
+ return None, text
1020
+ for match in re.finditer(r'[.!?](?= |\Z)', text):
1021
+ end = match.end()
1022
+ if end >= min_len:
1023
+ return text[:end].strip(), text[end:].lstrip()
1024
+ return None, text
1025
+
1026
+ def _fire_tts(raw_text):
1027
+ """Start TTS for raw_text in background. Returns (done_event, result)."""
1028
+ done = threading.Event()
1029
+ result = {'audio': None, 'error': None}
1030
+ def _run():
1031
+ try:
1032
+ t0 = time.time()
1033
+ cleaned = clean_for_tts(raw_text)
1034
+ t_clean = time.time()
1035
+ if cleaned and cleaned.strip():
1036
+ result['audio'] = _tts_generate_b64(
1037
+ cleaned, voice=voice or 'M1',
1038
+ tts_provider=tts_provider
1039
+ )
1040
+ t_done = time.time()
1041
+ logger.info(
1042
+ f"### TTS TIMING: clean={int((t_clean-t0)*1000)}ms "
1043
+ f"generate={int((t_done-t_clean)*1000)}ms "
1044
+ f"total={int((t_done-t0)*1000)}ms "
1045
+ f"text={len(cleaned or '')} chars"
1046
+ )
1047
+ except Exception as e:
1048
+ result['error'] = str(e)
1049
+ finally:
1050
+ done.set()
1051
+ threading.Thread(target=_run, daemon=True).start()
1052
+ return done, result
1053
+
1054
+ # Mid-stream TTS state
1055
+ _tts_buf = '' # raw incremental text buffer
1056
+ _tts_pending = [] # [(done_event, result_dict), ...]
1057
+ _chunks_sent = 0 # audio chunks already yielded early
1058
+
1059
+ full_response = None
1060
+ _stream_start = time.time()
1061
+ _STREAM_HARD_TIMEOUT = 310 # seconds — total allowed time
1062
+ _QUEUE_POLL_INTERVAL = 10 # seconds — yield heartbeat if no events
1063
+ while True:
1064
+ try:
1065
+ evt = event_queue.get(timeout=_QUEUE_POLL_INTERVAL)
1066
+ except queue.Empty:
1067
+ # No events for _QUEUE_POLL_INTERVAL seconds.
1068
+ # Yield a heartbeat to keep the browser/Cloudflare
1069
+ # connection alive (they time out at 60-100s of silence).
1070
+ elapsed = int(time.time() - _stream_start)
1071
+ if elapsed > _STREAM_HARD_TIMEOUT:
1072
+ yield json.dumps({'type': 'error', 'error': 'Gateway timeout'}) + '\n'
1073
+ break
1074
+ yield json.dumps({'type': 'heartbeat', 'elapsed': elapsed}) + '\n'
1075
+ continue
1076
+
1077
+ if evt['type'] == 'handshake':
1078
+ metrics['handshake_ms'] = evt['ms']
1079
+ continue
1080
+
1081
+ if evt['type'] == 'heartbeat':
1082
+ logger.info(f"### HEARTBEAT → browser ({evt.get('elapsed', 0)}s)")
1083
+ yield json.dumps({'type': 'heartbeat', 'elapsed': evt.get('elapsed', 0)}) + '\n'
1084
+ # Flush any TTS that finished during tool execution —
1085
+ # without this, audio sits in _tts_pending for the
1086
+ # entire duration of tool calls (30-60s+ silence).
1087
+ while _tts_pending and _tts_pending[0][0].is_set():
1088
+ _done_evt, _res = _tts_pending.pop(0)
1089
+ if _res.get('error'):
1090
+ yield _tts_error_event(_res['error'])
1091
+ elif _res.get('audio'):
1092
+ yield json.dumps({
1093
+ 'type': 'audio',
1094
+ 'audio': _res['audio'],
1095
+ 'audio_format': _audio_fmt,
1096
+ 'chunk': _chunks_sent,
1097
+ 'total_chunks': None,
1098
+ 'timing': {
1099
+ 'tts_ms': 0,
1100
+ 'total_ms': int((time.time() - t_request_start) * 1000),
1101
+ },
1102
+ }) + '\n'
1103
+ _chunks_sent += 1
1104
+ continue
1105
+
1106
+ if evt['type'] == 'delta':
1107
+ _tts_buf += evt['text']
1108
+ # Don't fire TTS if buffer looks like a system response
1109
+ # that will be suppressed at text_done. Wait for final
1110
+ # confirmation before speaking.
1111
+ _buf_stripped = _tts_buf.strip()
1112
+ # Suppress system responses — uses regex from compat layer
1113
+ # plus partial match for mid-stream detection
1114
+ _is_system_text = (
1115
+ is_system_response(_buf_stripped)
1116
+ or _buf_stripped.upper().startswith('HEARTBEAT')
1117
+ )
1118
+ # Fire TTS for complete sentences as they arrive
1119
+ if not _is_system_text and not _has_open_tag(_tts_buf):
1120
+ sentence, _tts_buf = _extract_sentence(_tts_buf, min_len=_min_sentence_chars)
1121
+ if sentence:
1122
+ logger.info(f"### TTS sentence (streaming): {sentence[:80]}")
1123
+ _tts_pending.append(_fire_tts(sentence))
1124
+ yield json.dumps({'type': 'delta', 'text': evt['text']}) + '\n'
1125
+ # Flush any TTS chunks that finished while text was streaming —
1126
+ # play audio as soon as it's ready instead of waiting for text_done
1127
+ while _tts_pending and _tts_pending[0][0].is_set():
1128
+ _done_evt, _res = _tts_pending.pop(0)
1129
+ if _res.get('error'):
1130
+ yield _tts_error_event(_res['error'])
1131
+ elif _res.get('audio'):
1132
+ yield json.dumps({
1133
+ 'type': 'audio',
1134
+ 'audio': _res['audio'],
1135
+ 'audio_format': _audio_fmt,
1136
+ 'chunk': _chunks_sent,
1137
+ 'total_chunks': None,
1138
+ 'timing': {
1139
+ 'tts_ms': 0,
1140
+ 'total_ms': int((time.time() - t_request_start) * 1000),
1141
+ },
1142
+ }) + '\n'
1143
+ _chunks_sent += 1
1144
+ continue
1145
+
1146
+ if evt['type'] == 'action':
1147
+ # Flush any TTS chunks that already finished —
1148
+ # avoids silence during long tool calls (the first
1149
+ # sentence TTS completes ~1s in but would otherwise
1150
+ # wait until text_done which can be minutes away).
1151
+ while _tts_pending and _tts_pending[0][0].is_set():
1152
+ _done_evt, _res = _tts_pending.pop(0)
1153
+ if _res.get('error'):
1154
+ yield _tts_error_event(_res['error'])
1155
+ elif _res.get('audio'):
1156
+ yield json.dumps({
1157
+ 'type': 'audio',
1158
+ 'audio': _res['audio'],
1159
+ 'audio_format': _audio_fmt,
1160
+ 'chunk': _chunks_sent,
1161
+ 'total_chunks': None,
1162
+ 'timing': {
1163
+ 'tts_ms': 0,
1164
+ 'total_ms': int((time.time() - t_request_start) * 1000),
1165
+ },
1166
+ }) + '\n'
1167
+ _chunks_sent += 1
1168
+ yield json.dumps({'type': 'action', 'action': evt['action']}) + '\n'
1169
+ continue
1170
+
1171
+ if evt['type'] == 'queued':
1172
+ StatusModule_hack = True # just yield to browser
1173
+ yield json.dumps({'type': 'queued'}) + '\n'
1174
+ continue
1175
+
1176
+ if evt['type'] == 'text_done':
1177
+ logger.info(f"### TEXT_DONE received. response={len(evt.get('response', '') or '')} chars, _tts_pending={len(_tts_pending)}, _tts_buf={repr(_tts_buf[:80])}")
1178
+ # Handle LLM/gateway errors with a spoken fallback
1179
+ if evt.get('error') and not evt.get('response'):
1180
+ error_msg = evt['error']
1181
+ logger.error(f"### GATEWAY ERROR → fallback: {error_msg}")
1182
+ evt['response'] = "One moment, still working on that."
1183
+ metrics['fallback_used'] = 1
1184
+ full_response = evt.get('response')
1185
+ if full_response and max_response_chars:
1186
+ full_response = _truncate_at_sentence(full_response, max_response_chars)
1187
+
1188
+ # Suppress bare NO/YES sentinel responses to system triggers
1189
+ # (gateway returns "NO" for wake-word checks on __session_start__)
1190
+ _is_system_trigger = user_message.startswith('__')
1191
+ if _is_system_trigger and full_response and \
1192
+ full_response.strip().upper() in ('NO', 'NO.', 'YES', 'YES.'):
1193
+ logger.info(f'Suppressing sentinel "{full_response.strip()}" for system trigger')
1194
+ yield json.dumps({'type': 'no_audio'}) + '\n'
1195
+ log_metrics(metrics)
1196
+ break
1197
+
1198
+ # Tag-only response fallback: if the agent responded
1199
+ # with ONLY action tags and no spoken words, prepend
1200
+ # a brief acknowledgment so TTS has something to say.
1201
+ if full_response and re.match(
1202
+ r'^\s*(\[[^\]]+\]\s*)+$', full_response
1203
+ ):
1204
+ logger.info(
1205
+ f"### Tag-only response detected, prepending "
1206
+ f"spoken text: {full_response.strip()[:60]}"
1207
+ )
1208
+ full_response = "Here you go. " + full_response
1209
+
1210
+ metrics['llm_inference_ms'] = int((time.time() - t_llm_start) * 1000)
1211
+ metrics['tool_count'] = sum(
1212
+ 1 for a in captured_actions
1213
+ if a.get('type') == 'tool' and a.get('phase') == 'start'
1214
+ )
1215
+ metrics['profile'] = 'gateway'
1216
+ metrics['model'] = 'glm-4.7-flash'
1217
+ logger.debug(f"[GW] Gateway response ({len(full_response or '')} chars): {repr((full_response or '')[:300])}")
1218
+ logger.info(
1219
+ f"### LLM inference completed in "
1220
+ f"{metrics['llm_inference_ms']}ms "
1221
+ f"(tools={metrics['tool_count']})"
1222
+ )
1223
+
1224
+ # ── Clear recovery mode on successful gateway response ──
1225
+ if full_response and full_response.strip() and _session_recovery_key is not None:
1226
+ _exit_session_recovery()
1227
+
1228
+ # ── Retry once on instant empty response ──
1229
+ # IMPORTANT: check BEFORE yielding text_done.
1230
+ # If we yield empty text_done first, the client
1231
+ # shows "Sorry" and cancels its reader — the retry
1232
+ # result never reaches it.
1233
+ # Instead: yield {'type':'retrying'} to keep the
1234
+ # client alive, then swap the event queue.
1235
+ _is_empty = not full_response or not full_response.strip()
1236
+ if _is_empty and metrics.get('llm_inference_ms', 9999) < 5000 \
1237
+ and not getattr(stream_response, '_retried', False):
1238
+ stream_response._retried = True
1239
+ logger.warning(
1240
+ f"### EMPTY RESPONSE in {metrics['llm_inference_ms']}ms "
1241
+ f"— retrying once (client kept alive via 'retrying' event)"
1242
+ )
1243
+ # Tell the client to wait — don't show fallback
1244
+ yield json.dumps({'type': 'retrying'}) + '\n'
1245
+ time.sleep(2)
1246
+ # Re-send the same message through the gateway on the same key.
1247
+ # Openclaw removed the orphaned message on the first attempt.
1248
+ # If this is session_start, also clear the session file to eliminate
1249
+ # any further stale state before the retry.
1250
+ # Note: session file clearing moved to host watchdog
1251
+ # (session files are inside openclaw container, not accessible from here)
1252
+ retry_queue = queue.Queue()
1253
+ captured_actions.clear()
1254
+ def _retry_gateway():
1255
+ gateway_manager.stream_to_queue(
1256
+ retry_queue, message_with_context,
1257
+ _session_key, captured_actions,
1258
+ gateway_id=gateway_id,
1259
+ agent_id=agent_id,
1260
+ )
1261
+ retry_thread = threading.Thread(
1262
+ target=_retry_gateway, daemon=True
1263
+ )
1264
+ t_llm_start = time.time()
1265
+ retry_thread.start()
1266
+ event_queue = retry_queue
1267
+ logger.info("### RETRY: re-sent message to gateway")
1268
+ continue # back to event loop — text_done NOT sent yet
1269
+
1270
+ # ── Z.AI direct fallback after double-empty ──
1271
+ if _is_empty and getattr(stream_response, '_retried', False):
1272
+ logger.warning('### DOUBLE EMPTY — session poisoned, entering recovery mode')
1273
+
1274
+ # 1. Switch to recovery session key so NEXT request
1275
+ # goes to a fresh openclaw session (not the poisoned one)
1276
+ _enter_session_recovery()
1277
+
1278
+ # 2. Force-disconnect gateway WS so it reconnects fresh
1279
+ try:
1280
+ _gw = gateway_manager.get(gateway_id)
1281
+ if _gw and hasattr(_gw, 'force_disconnect'):
1282
+ _gw.force_disconnect()
1283
+ logger.warning('### Force-disconnected gateway WS after double-empty')
1284
+ except Exception as _dfe:
1285
+ logger.error(f'### Failed to disconnect gateway: {_dfe}')
1286
+
1287
+ # 3. Write restart flag for host watchdog (background cleanup)
1288
+ try:
1289
+ _flag_path = Path('/app/runtime/uploads/.restart-openclaw.flag')
1290
+ _flag_path.write_text(
1291
+ f'double-empty at {__import__("datetime").datetime.utcnow().isoformat()}Z'
1292
+ )
1293
+ logger.warning('### Wrote .restart-openclaw.flag — watchdog will clean up poisoned session')
1294
+ except Exception as _rfe:
1295
+ logger.error(f'### Failed to write restart flag: {_rfe}')
1296
+
1297
+ # 4. Try Z.AI direct fallback for THIS message
1298
+ try:
1299
+ import requests as _req
1300
+ _zai_key = os.environ.get('ZAI_API_KEY', '')
1301
+ if _zai_key:
1302
+ _zai_resp = _req.post(
1303
+ 'https://api.z.ai/api/anthropic/v1/messages',
1304
+ headers={
1305
+ 'x-api-key': _zai_key,
1306
+ 'anthropic-version': '2023-06-01',
1307
+ 'content-type': 'application/json',
1308
+ },
1309
+ json={
1310
+ 'model': 'glm-4.7',
1311
+ 'max_tokens': 400,
1312
+ 'messages': [{'role': 'user', 'content': message_with_context}],
1313
+ },
1314
+ timeout=20,
1315
+ )
1316
+ if _zai_resp.status_code == 200:
1317
+ _zai_data = _zai_resp.json()
1318
+ _zai_text = _zai_data.get('content', [{}])[0].get('text', '')
1319
+ if _zai_text:
1320
+ full_response = _zai_text
1321
+ metrics['fallback_used'] = 1
1322
+ metrics['profile'] = 'zai-direct'
1323
+ logger.info(f'### Z.AI direct fallback succeeded: {len(_zai_text)} chars')
1324
+ except Exception as _zfe:
1325
+ logger.error(f'### Z.AI direct fallback failed: {_zfe}')
1326
+
1327
+ if not full_response or not full_response.strip():
1328
+ full_response = "I had a brief connection issue. I'm reconnecting now — please try again."
1329
+
1330
+ yield json.dumps({
1331
+ 'type': 'text_done',
1332
+ 'response': full_response,
1333
+ 'actions': captured_actions,
1334
+ 'timing': {
1335
+ 'handshake_ms': metrics.get('handshake_ms'),
1336
+ 'llm_ms': metrics.get('llm_inference_ms'),
1337
+ }
1338
+ }) + '\n'
1339
+
1340
+ # Auto-reset removed — loop detection (Phase 1 config)
1341
+ # handles stuck agents; consecutive empties no longer
1342
+ # trigger a session key bump that would cold-cache Z.AI.
1343
+
1344
+ # Handle [SESSION_RESET] trigger from agent
1345
+ if full_response and '[SESSION_RESET]' in full_response:
1346
+ old_key = get_voice_session_key()
1347
+ new_key = bump_voice_session()
1348
+ logger.info(
1349
+ f'### AGENT-TRIGGERED SESSION RESET: {old_key} → {new_key}'
1350
+ )
1351
+ full_response = full_response.replace('[SESSION_RESET]', '').strip()
1352
+
1353
+ # Detect agent returning a bare file path (e.g. from TTS tool use)
1354
+ if full_response and re.match(r'^/tmp/[\w/.-]+$', full_response.strip()):
1355
+ file_path = full_response.strip()
1356
+ logger.warning(f'Agent returned file path — serving directly: {file_path}')
1357
+ try:
1358
+ with open(file_path, 'rb') as f:
1359
+ file_bytes = f.read()
1360
+ audio_b64 = base64.b64encode(file_bytes).decode('utf-8')
1361
+ ext = file_path.rsplit('.', 1)[-1].lower()
1362
+ audio_format = ext if ext in ('mp3', 'wav', 'ogg') else 'mp3'
1363
+ metrics['tts_generation_ms'] = 0
1364
+ metrics['total_ms'] = int((time.time() - t_request_start) * 1000)
1365
+ yield json.dumps({
1366
+ 'type': 'audio',
1367
+ 'audio': audio_b64,
1368
+ 'audio_format': audio_format,
1369
+ 'chunk': 0,
1370
+ 'timing': {'tts_ms': 0, 'total_ms': metrics.get('total_ms')},
1371
+ }) + '\n'
1372
+ logger.info(f'Served agent-generated audio: {len(file_bytes)} bytes ({audio_format})')
1373
+ except Exception as fp_err:
1374
+ logger.error(f'Failed to serve agent audio file {file_path}: {fp_err}')
1375
+ yield json.dumps({
1376
+ 'type': 'tts_error',
1377
+ 'provider': 'agent',
1378
+ 'reason': 'file_read_error',
1379
+ 'error': f'Agent generated audio but file could not be read: {fp_err}',
1380
+ }) + '\n'
1381
+ log_metrics(metrics)
1382
+ break
1383
+
1384
+ # ── Flush TTS buffer + yield audio chunks in order ──
1385
+ metrics['response_len'] = len(full_response) if full_response else 0
1386
+
1387
+ # If response was suppressed (None), discard ALL
1388
+ # pending TTS — never speak suppressed text like
1389
+ # HEARTBEAT_OK that leaked through delta streaming.
1390
+ if not full_response:
1391
+ if _tts_pending:
1392
+ logger.info(
1393
+ f"### Discarding {len(_tts_pending)} TTS "
1394
+ f"chunks for suppressed response"
1395
+ )
1396
+ _tts_buf = ''
1397
+ _tts_pending = []
1398
+
1399
+ # Fire TTS for any remaining buffered text
1400
+ _remaining = _tts_buf.strip()
1401
+ if _remaining:
1402
+ _tts_pending.append(_fire_tts(_remaining))
1403
+ _tts_buf = ''
1404
+
1405
+ # Fallback: no sentences extracted (very short response)
1406
+ if not _tts_pending and full_response:
1407
+ tts_text = clean_for_tts(full_response)
1408
+ if tts_text and tts_text.strip():
1409
+ _tts_pending.append(_fire_tts(tts_text))
1410
+
1411
+ if not _tts_pending:
1412
+ logger.info('Skipping TTS — no speakable text')
1413
+ # Tell the frontend there's no audio coming so it can
1414
+ # reset isProcessing and re-enable the mic.
1415
+ yield json.dumps({'type': 'no_audio'}) + '\n'
1416
+ metrics['total_ms'] = int((time.time() - t_request_start) * 1000)
1417
+ log_metrics(metrics)
1418
+ if full_response:
1419
+ log_conversation('assistant', full_response,
1420
+ session_id=session_id,
1421
+ tts_provider=tts_provider, voice=voice)
1422
+ save_conversation_turn(
1423
+ user_msg=user_message,
1424
+ ai_response=full_response,
1425
+ session_id=session_id,
1426
+ session_key=_session_key,
1427
+ tts_provider=tts_provider,
1428
+ voice=voice,
1429
+ duration_ms=metrics.get('total_ms'),
1430
+ actions=captured_actions,
1431
+ identified_person=identified_person,
1432
+ )
1433
+ break
1434
+
1435
+ t_tts_start = time.time()
1436
+ total_chunks = _chunks_sent + len(_tts_pending)
1437
+ tts_ok = True
1438
+ for i, (done_evt, res) in enumerate(_tts_pending):
1439
+ done_evt.wait(timeout=30)
1440
+ if res['error']:
1441
+ metrics['tts_success'] = 0
1442
+ metrics['tts_error'] = res['error']
1443
+ yield _tts_error_event(res['error'])
1444
+ tts_ok = False
1445
+ break
1446
+ if res['audio']:
1447
+ yield json.dumps({
1448
+ 'type': 'audio',
1449
+ 'audio': res['audio'],
1450
+ 'audio_format': _audio_fmt,
1451
+ 'chunk': _chunks_sent + i,
1452
+ 'total_chunks': total_chunks,
1453
+ 'timing': {
1454
+ 'tts_ms': int((time.time() - t_tts_start) * 1000),
1455
+ 'total_ms': int((time.time() - t_request_start) * 1000),
1456
+ },
1457
+ }) + '\n'
1458
+
1459
+ metrics['tts_generation_ms'] = int((time.time() - t_tts_start) * 1000)
1460
+ metrics['tts_text_len'] = metrics['response_len']
1461
+ metrics['total_ms'] = int((time.time() - t_request_start) * 1000)
1462
+ log_metrics(metrics)
1463
+ if full_response:
1464
+ log_conversation('assistant', full_response,
1465
+ session_id=session_id,
1466
+ tts_provider=tts_provider, voice=voice)
1467
+ save_conversation_turn(
1468
+ user_msg=user_message,
1469
+ ai_response=full_response,
1470
+ session_id=session_id,
1471
+ session_key=_session_key,
1472
+ tts_provider=tts_provider,
1473
+ voice=voice,
1474
+ duration_ms=metrics.get('total_ms'),
1475
+ actions=captured_actions,
1476
+ identified_person=identified_person,
1477
+ )
1478
+ break
1479
+
1480
+ if evt['type'] == 'error':
1481
+ yield json.dumps({
1482
+ 'type': 'error',
1483
+ 'error': evt.get('error', 'Unknown error')
1484
+ }) + '\n'
1485
+ break
1486
+
1487
+ # Drain any unprocessed events (debug: detect generator exit without text_done)
1488
+ _remaining_evts = []
1489
+ while not event_queue.empty():
1490
+ try:
1491
+ _remaining_evts.append(event_queue.get_nowait())
1492
+ except Exception:
1493
+ break
1494
+ if _remaining_evts:
1495
+ _types = [e.get('type', '?') for e in _remaining_evts]
1496
+ logger.warning(f"### STREAM EXIT with {len(_remaining_evts)} unprocessed events: {_types}")
1497
+
1498
+ return Response(
1499
+ stream_response(),
1500
+ mimetype='application/x-ndjson',
1501
+ headers={'X-Accel-Buffering': 'no', 'Cache-Control': 'no-cache'}
1502
+ )
1503
+
1504
+ else:
1505
+ # ── NON-STREAMING: wait for full Gateway response ─────────
1506
+ gw_thread.join(timeout=310)
1507
+ while not event_queue.empty():
1508
+ evt = event_queue.get_nowait()
1509
+ if evt['type'] == 'text_done':
1510
+ ai_response = evt.get('response')
1511
+ elif evt['type'] == 'handshake':
1512
+ metrics['handshake_ms'] = evt['ms']
1513
+ metrics['llm_inference_ms'] = int((time.time() - t_llm_start) * 1000)
1514
+ metrics['tool_count'] = sum(
1515
+ 1 for a in captured_actions
1516
+ if a.get('type') == 'tool' and a.get('phase') == 'start'
1517
+ )
1518
+ metrics['profile'] = 'gateway'
1519
+ metrics['model'] = 'glm-4.7-flash'
1520
+ logger.info(
1521
+ f"### LLM inference completed in {metrics['llm_inference_ms']}ms "
1522
+ f"(tools={metrics['tool_count']})"
1523
+ )
1524
+
1525
+ except Exception as e:
1526
+ logger.error(f'Failed to call Clawdbot Gateway: {e}')
1527
+
1528
+ # ── FALLBACK: Z.AI direct (glm-4.5-flash, no tools) ──────────────────
1529
+ if not ai_response:
1530
+ if metrics.get('profile') == 'gateway':
1531
+ logger.warning('No text response from Gateway, falling back to Z.AI flash...')
1532
+ metrics['fallback_used'] = 1
1533
+ else:
1534
+ logger.info('Using Z.AI flash direct (primary path)')
1535
+ t_flash_start = time.time()
1536
+ # Lazy import to avoid circular dependency (server.py imports this blueprint)
1537
+ try:
1538
+ import server as _server
1539
+ ai_response = _server.get_zai_direct_response(message_with_context, session_id)
1540
+ except Exception as e:
1541
+ logger.error(f'Z.AI direct call failed: {e}')
1542
+ ai_response = None
1543
+ metrics['profile'] = 'flash-direct'
1544
+ metrics['model'] = 'glm-4.5-flash'
1545
+ metrics['llm_inference_ms'] = int((time.time() - t_flash_start) * 1000)
1546
+
1547
+ # ── LAST RESORT ───────────────────────────────────────────────────────
1548
+ if not ai_response:
1549
+ logger.warning('Both Gateway and Z.AI flash failed, using generic fallback')
1550
+ ai_response = "One moment, I'm still working on something."
1551
+
1552
+ # Clean text for TTS
1553
+ tts_text = clean_for_tts(ai_response)
1554
+ logger.info(f'Cleaned TTS text ({len(tts_text)} chars): {tts_text[:100]}...')
1555
+ metrics['response_len'] = len(ai_response) if ai_response else 0
1556
+ metrics['tts_text_len'] = len(tts_text)
1557
+
1558
+ # Generate TTS audio
1559
+ t_tts_start = time.time()
1560
+ audio_base64 = None
1561
+ if tts_text and tts_text.strip():
1562
+ audio_base64 = _tts_generate_b64(tts_text, voice=voice or 'M1',
1563
+ tts_provider=tts_provider)
1564
+ if audio_base64 is None:
1565
+ metrics['tts_success'] = 0
1566
+ metrics['tts_error'] = 'TTS generation failed'
1567
+ t_tts_end = time.time()
1568
+ metrics['tts_generation_ms'] = int((t_tts_end - t_tts_start) * 1000)
1569
+ metrics['total_ms'] = int((t_tts_end - t_request_start) * 1000)
1570
+
1571
+ log_metrics(metrics)
1572
+ if ai_response:
1573
+ log_conversation('assistant', ai_response, session_id=session_id,
1574
+ tts_provider=tts_provider, voice=voice)
1575
+ save_conversation_turn(
1576
+ user_msg=user_message,
1577
+ ai_response=ai_response,
1578
+ session_id=session_id,
1579
+ session_key=get_voice_session_key(),
1580
+ tts_provider=tts_provider,
1581
+ voice=voice,
1582
+ duration_ms=metrics.get('total_ms'),
1583
+ actions=captured_actions,
1584
+ identified_person=identified_person,
1585
+ )
1586
+
1587
+ response_data = {'response': ai_response, 'user_said': user_message}
1588
+ if audio_base64:
1589
+ response_data['audio'] = audio_base64
1590
+ if captured_actions:
1591
+ response_data['actions'] = captured_actions
1592
+ response_data['timing'] = {
1593
+ 'handshake_ms': metrics.get('handshake_ms'),
1594
+ 'llm_ms': metrics.get('llm_inference_ms'),
1595
+ 'tts_ms': metrics.get('tts_generation_ms'),
1596
+ 'total_ms': metrics.get('total_ms'),
1597
+ }
1598
+
1599
+ return jsonify(response_data)
1600
+
1601
+ # ---------------------------------------------------------------------------
1602
+ # POST /api/conversation/abort
1603
+ # ---------------------------------------------------------------------------
1604
+
1605
+
1606
@conversation_bp.route('/api/conversation/abort', methods=['POST'])
def conversation_abort():
    """Abort the active agent run for the current voice session.

    Fire-and-forget from client — used by PTT interrupt and sendMessage
    interrupt to tell openclaw to stop generating so it doesn't waste compute.
    """
    active_key = get_voice_session_key()

    # Pull the abort source/text out of the request body — used only for
    # log context, never for control flow.
    abort_source, abort_text = 'unknown', ''
    try:
        payload = request.get_json(silent=True) or {}
        abort_source = payload.get('source', 'unknown')
        abort_text = payload.get('text', '')
    except Exception:
        pass

    did_abort = False
    gateway = gateway_manager.get('openclaw')
    if gateway and hasattr(gateway, 'abort_active_run'):
        did_abort = gateway.abort_active_run(active_key)

    logger.info(
        f"### ABORT request session={active_key} aborted={did_abort} "
        f"source={abort_source} text={abort_text!r}"
    )
    return jsonify({'ok': True, 'aborted': did_abort})
1629
+
1630
+
1631
+ # ---------------------------------------------------------------------------
1632
+ # POST /api/conversation/steer
1633
+ # ---------------------------------------------------------------------------
1634
+
1635
+
1636
@conversation_bp.route('/api/conversation/steer', methods=['POST'])
def conversation_steer():
    """Inject a user message into the active agent run (steer mode).

    Fire-and-forget from client — used when the user speaks while the
    agent is silently working (tools / sub-agents / heartbeat). Rather
    than aborting the active run and starting over, a second chat.send
    goes to the same session; OpenClaw's messages.queue.mode=steer
    injects it at the next tool boundary so the agent sees the user's
    correction and pivots immediately.

    The already-open /api/conversation streaming response keeps
    receiving the steered output — no new streaming connection needed.

    Request body:
        message (str) — the user's text to inject
        source  (str) — label for logging (e.g. 'clawdbot-sendMessage')

    Returns:
        { ok: true, steered: true/false }
    """
    payload = request.get_json(silent=True) or {}
    text = (payload.get('message') or '').strip()
    origin = payload.get('source', 'unknown')

    # Guard clauses: reject empty and oversized input up front
    # (same limits as the main conversation endpoint).
    if not text:
        return jsonify({'ok': False, 'error': 'No message provided'}), 400
    if len(text) > 4000:
        return jsonify({'ok': False, 'error': 'Message too long'}), 400

    active_key = get_voice_session_key()
    accepted = gateway_manager.send_steer(text, active_key)

    logger.info(
        f"### STEER request session={active_key} steered={accepted} "
        f"source={origin} text={text!r}"
    )

    # Record the steered text as a user turn so the transcript is preserved.
    log_conversation('user', text, session_id='default')

    return jsonify({'ok': True, 'steered': accepted})
1681
+
1682
+
1683
+ # ---------------------------------------------------------------------------
1684
+ # POST /api/conversation/reset
1685
+ # ---------------------------------------------------------------------------
1686
+
1687
+
1688
@conversation_bp.route('/api/conversation/reset', methods=['POST'])
def conversation_reset():
    """Drop the in-process conversation history for one session."""
    payload = request.get_json() or {}
    target_session = payload.get('session_id', 'default')
    # pop() with a default is a no-op when the session has no history yet.
    conversation_histories.pop(target_session, None)
    return jsonify({'status': 'ok', 'message': 'Conversation history cleared'})
1695
+
1696
+
1697
+ # ---------------------------------------------------------------------------
1698
+ # POST /api/session/reset — manual session reset from UI actions panel
1699
+ # ---------------------------------------------------------------------------
1700
+
1701
@conversation_bp.route('/api/session/reset', methods=['POST'])
def session_reset():
    """Clear the corrupted openclaw session state and return a fresh session key.

    Called by the Reset button in the UI actions panel. First truncates the
    openclaw session JSONL file (so orphaned messages don't cascade into the
    next conversation), then bumps the voice session key so the next request
    starts completely fresh.

    Returns:
        JSON {status, old, new} with the previous and new voice session keys.
    """
    from datetime import datetime, timezone

    old_key = get_voice_session_key()
    # Find and truncate the openclaw session file for the current session key.
    # Best-effort: the key bump below still guarantees a fresh session even
    # if the file can't be cleared (e.g. running outside the container).
    try:
        sessions_dir = Path('/home/node/.openclaw/agents/openvoiceui/sessions')
        sessions_json = sessions_dir / 'sessions.json'
        if sessions_json.exists():
            sessions_map = json.loads(sessions_json.read_text())
            # The openclaw session key format is "agent:openvoiceui:<voice_key>"
            oclaw_key = f'agent:openvoiceui:{old_key}'
            session_id = sessions_map.get(oclaw_key, {}).get('sessionId')
            if session_id:
                session_file = sessions_dir / f'{session_id}.jsonl'
                if session_file.exists():
                    # Naive-UTC ISO timestamp with a trailing 'Z' — same shape
                    # the previous utcnow()-based code produced. utcnow() is
                    # deprecated, so derive it from an aware UTC datetime.
                    _ts = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + 'Z'
                    # Rewrite the file with only the session header line.
                    # json.dumps with compact separators matches the old
                    # hand-built string but can't be broken by quoting.
                    header = json.dumps(
                        {
                            'type': 'session',
                            'version': 3,
                            'id': session_id,
                            'timestamp': _ts,
                            'cwd': '/home/node/.openclaw/workspace',
                        },
                        separators=(',', ':'),
                    )
                    session_file.write_text(header + '\n')
                    logger.info(f'### SESSION RESET: cleared openclaw session file {session_id}.jsonl')
    except Exception as e:
        logger.warning(f'### SESSION RESET: could not clear openclaw session file: {e}')
    new_key = bump_voice_session()
    return jsonify({'status': 'ok', 'old': old_key, 'new': new_key})
1729
+
1730
+
1731
+ # ---------------------------------------------------------------------------
1732
+ # GET /api/tts/providers
1733
+ # ---------------------------------------------------------------------------
1734
+
1735
+
1736
@conversation_bp.route('/api/tts/providers', methods=['GET'])
def tts_providers_list():
    """Return every registered TTS provider (active or not) plus the default."""
    try:
        all_providers = list_providers(include_inactive=True)
        cfg_file = (Path(__file__).parent.parent
                    / 'tts_providers' / 'providers_config.json')
        chosen_default = 'supertonic'
        try:
            with open(cfg_file, 'r') as fh:
                chosen_default = json.load(fh).get('default_provider', 'supertonic')
        except Exception:
            # Missing/unreadable config — fall back to the built-in default.
            pass
        return jsonify({'providers': all_providers,
                        'default_provider': chosen_default})
    except Exception as e:
        logger.error(f'Failed to list TTS providers: {e}')
        return jsonify({'error': f'Failed to list providers: {e}'}), 500
1754
+
1755
+ # ---------------------------------------------------------------------------
1756
+ # POST /api/tts/generate
1757
+ # ---------------------------------------------------------------------------
1758
+
1759
+
1760
@conversation_bp.route('/api/tts/generate', methods=['POST'])
def tts_generate():
    """
    Generate speech from text using the specified TTS provider.

    Request JSON:
        text     : str   — text to synthesize (required, max 2000 chars)
        provider : str   — provider ID (default: supertonic)
        voice    : str   — voice ID (default: provider default)
        lang     : str   — language code, case-insensitive (default: en)
        speed    : float — speech speed, 0.25–4.0 (default: provider default)
        options  : dict  — provider-specific options
    Returns: audio file (WAV or MP3, per provider) with X-TTS-* headers.
    """
    try:
        data = request.get_json()
        if not data:
            return jsonify({'error': 'No JSON data provided'}), 400

        # `or ''` guards against an explicit JSON null for "text", which
        # would otherwise raise AttributeError on .strip() and surface as 500.
        text = (data.get('text') or '').strip()
        if not text:
            return jsonify({'error': 'Text cannot be empty'}), 400

        # Length guard (P7-T3 security audit)
        if len(text) > 2000:
            return jsonify({'error': 'Text too long (max 2000 characters)'}), 400

        provider_id = data.get('provider', 'supertonic')
        voice = data.get('voice', None)
        lang = data.get('lang', 'en')
        speed = data.get('speed', None)
        options = data.get('options', {})

        valid_langs = ['en', 'ko', 'es', 'pt', 'fr', 'zh', 'ja', 'de']
        if lang and lang.lower() not in valid_langs:
            return jsonify({
                'error': f"Invalid language: {lang}. Supported: {', '.join(valid_langs)}"
            }), 400
        if lang:
            # Normalize so providers receive the canonical lowercase code the
            # validation above actually checked (previously a mixed-case lang
            # passed validation but was forwarded unchanged).
            lang = lang.lower()

        if speed is not None:
            try:
                speed = float(speed)
                if speed < 0.25 or speed > 4.0:
                    return jsonify({'error': 'Speed must be between 0.25 and 4.0'}), 400
            except (ValueError, TypeError):
                return jsonify({'error': 'Speed must be a valid number'}), 400

        try:
            provider = get_provider(provider_id)
        except ValueError:
            available = ', '.join([p['provider_id'] for p in list_providers()])
            return jsonify({'error': 'Invalid TTS provider', 'available_providers': available}), 400

        logger.info(
            f"TTS request: provider={provider_id}, text='{text[:50]}...', "
            f"voice={voice}, lang={lang}, speed={speed}"
        )

        # Only forward parameters the caller actually supplied so the
        # provider's own defaults still apply.
        gen_params = {'text': text}
        if voice is not None:
            gen_params['voice'] = voice
        if lang is not None:
            gen_params['lang'] = lang
        if speed is not None:
            gen_params['speed'] = speed
        gen_params.update(options)

        try:
            audio_bytes = provider.generate_speech(**gen_params)
        except ValueError as e:
            # Provider rejected a parameter value — client error, not server.
            return jsonify({'error': f'Invalid parameter: {e}'}), 400
        except Exception as e:
            logger.error(f'Speech generation failed for {provider_id}: {e}')
            return jsonify({'error': f'Speech generation failed: {e}'}), 500

        provider_format = provider.get_info().get('audio_format', 'wav')
        mime_type = 'audio/mpeg' if provider_format == 'mp3' else 'audio/wav'
        response = make_response(audio_bytes)
        response.headers['Content-Type'] = mime_type
        response.headers['Content-Length'] = len(audio_bytes)
        response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
        response.headers['X-TTS-Provider'] = provider_id
        if voice:
            response.headers['X-TTS-Voice'] = voice
        return response

    except ValueError as e:
        return jsonify({'error': f'Invalid input: {e}'}), 400
    except Exception as e:
        import traceback
        logger.error(f'TTS generate endpoint error: {e}')
        logger.error(traceback.format_exc())
        return jsonify({'error': 'Internal server error'}), 500
1853
+
1854
+ # ---------------------------------------------------------------------------
1855
+ # POST /api/tts/clone — Clone a voice from audio
1856
+ # ---------------------------------------------------------------------------
1857
+
1858
+
1859
@conversation_bp.route('/api/tts/clone', methods=['POST'])
def tts_clone_voice():
    """
    Clone a voice from an audio sample.

    Accepts either:
        - JSON: {"audio_url": "...", "name": "...", "reference_text": "..."}
        - Multipart form: audio file + name field (the upload is saved to the
          uploads dir and exposed via a public URL for fal.ai to fetch)

    Returns: JSON with voice_id, name, embedding metadata.
    """
    try:
        provider = get_provider('qwen3')
        if not provider.is_available():
            return jsonify({'error': 'Qwen3 provider not available (FAL_KEY not set)'}), 503

        # JSON mode (audio already hosted at a URL)
        if request.is_json:
            data = request.get_json()
            # `or ''` guards against explicit JSON nulls, which would
            # otherwise raise AttributeError on .strip() and surface as 500
            # instead of a clean 400 validation error.
            audio_url = (data.get('audio_url') or '').strip()
            name = (data.get('name') or '').strip()
            reference_text = (data.get('reference_text') or '').strip() or None

            if not audio_url:
                return jsonify({'error': 'audio_url is required'}), 400
            if not name:
                return jsonify({'error': 'name is required'}), 400

        # Multipart form mode (upload audio file directly)
        elif 'audio' in request.files:
            from services.paths import UPLOADS_DIR
            import uuid

            audio_file = request.files['audio']
            name = request.form.get('name', '').strip()
            reference_text = request.form.get('reference_text', '').strip() or None

            if not name:
                return jsonify({'error': 'name field is required'}), 400
            if not audio_file.filename:
                return jsonify({'error': 'Empty audio file'}), 400

            # Validate extension, then save under a random server-chosen name —
            # the client filename is never used as a path component.
            ext = Path(audio_file.filename).suffix.lower()
            if ext not in ('.wav', '.mp3', '.m4a', '.ogg', '.webm', '.flac'):
                return jsonify({'error': f'Unsupported audio format: {ext}'}), 400

            safe_name = f"voice_clone_{uuid.uuid4().hex[:12]}{ext}"
            UPLOADS_DIR.mkdir(parents=True, exist_ok=True)
            save_path = UPLOADS_DIR / safe_name
            audio_file.save(str(save_path))

            # Build public URL for fal.ai to fetch
            audio_url = f"{request.host_url.rstrip('/')}/uploads/{safe_name}"
        else:
            return jsonify({
                'error': 'Send JSON with audio_url or multipart form with audio file'
            }), 400

        logger.info(f"Voice clone request: name='{name}', url={audio_url[:80]}")
        result = provider.clone_voice(
            audio_url=audio_url,
            name=name,
            reference_text=reference_text,
        )

        return jsonify({
            'status': 'ok',
            'voice_id': result['voice_id'],
            'name': result['name'],
            'created_at': result['created_at'],
            'clone_time_ms': result['clone_time_ms'],
            'embedding_size': result['embedding_size'],
            'usage': (
                f'Use voice_id "{result["voice_id"]}" in /api/tts/generate '
                f'with provider=qwen3'
            ),
        })

    except RuntimeError as e:
        # Provider signals clone failures as RuntimeError — safe to expose.
        logger.error(f"Voice clone failed: {e}")
        return jsonify({'error': str(e)}), 500
    except Exception as e:
        import traceback
        logger.error(f"Voice clone error: {e}")
        logger.error(traceback.format_exc())
        return jsonify({'error': 'Internal server error'}), 500
1946
+
1947
+
1948
+ # ---------------------------------------------------------------------------
1949
+ # GET /api/tts/voices — List all voices (built-in + cloned) across providers
1950
+ # ---------------------------------------------------------------------------
1951
+
1952
+
1953
@conversation_bp.route('/api/tts/voices', methods=['GET'])
def tts_voices_list():
    """List every voice (built-in and cloned) grouped by active provider."""
    try:
        # Keyed by provider_id, falling back to its display name.
        catalog = {
            info.get('provider_id', info.get('name', 'unknown')): {
                'builtin': info.get('voices', []),
                'cloned': info.get('cloned_voices', []),
            }
            for info in list_providers(include_inactive=False)
        }
        return jsonify({'voices': catalog})
    except Exception as e:
        logger.error(f"Failed to list voices: {e}")
        return jsonify({'error': str(e)}), 500
1970
+
1971
+
1972
+ # ---------------------------------------------------------------------------
1973
+ # DELETE /api/tts/voices/<voice_id> — Retire a cloned voice
1974
+ # ---------------------------------------------------------------------------
1975
+
1976
+
1977
@conversation_bp.route('/api/tts/voices/<voice_id>', methods=['DELETE'])
def tts_delete_voice(voice_id):
    """Retire a cloned voice embedding (renamed, not deleted)."""
    try:
        if not voice_id.startswith('clone_'):
            return jsonify({'error': 'Can only retire cloned voices (clone_*)'}), 400

        from services.paths import VOICE_CLONES_DIR
        target = VOICE_CLONES_DIR / voice_id

        # Reject any voice_id whose resolved path escapes the clones dir.
        try:
            target.resolve().relative_to(VOICE_CLONES_DIR.resolve())
        except ValueError:
            return jsonify({'error': 'Invalid voice_id'}), 400

        if not target.exists():
            return jsonify({'error': f'Voice {voice_id} not found'}), 404

        # NEVER DELETE rule: keep the embedding on disk, just mark it retired.
        target.rename(target.with_name(target.name + '.retired'))
        logger.info(f"Cloned voice retired: {voice_id}")

        return jsonify({'status': 'ok', 'voice_id': voice_id, 'action': 'retired'})
    except Exception as e:
        logger.error(f"Failed to retire voice {voice_id}: {e}")
        return jsonify({'error': str(e)}), 500
2005
+
2006
+
2007
+ # ---------------------------------------------------------------------------
2008
+ # POST /api/supertonic-tts (DEPRECATED — use /api/tts/generate)
2009
+ # ---------------------------------------------------------------------------
2010
+
2011
+
2012
@conversation_bp.route('/api/supertonic-tts', methods=['POST'])
def supertonic_tts_endpoint():
    """
    Generate speech via Supertonic TTS (deprecated — prefer /api/tts/generate).

    Request JSON:
        text        : str   — text to synthesize (required)
        lang        : str   — language code: en, ko, es, pt, fr (default: en)
        speed       : float — 0.5–2.0 (default: 1.0)
        voice_style : str   — M1–M5 / F1–F5, case-insensitive (default: M1)
    Returns: WAV audio
    """
    try:
        data = request.get_json()
        if not data:
            return jsonify({'error': 'No JSON data provided'}), 400

        # `or ''` / `or <default>` guard against explicit JSON nulls, which
        # previously raised AttributeError on .strip()/.lower()/.upper() and
        # surfaced as 500s instead of clean validation errors.
        text = (data.get('text') or '').strip()
        if not text:
            return jsonify({'error': 'Text cannot be empty'}), 400

        lang = (data.get('lang') or 'en').lower()
        if lang not in ['en', 'ko', 'es', 'pt', 'fr']:
            return jsonify({
                'error': f"Invalid language: {lang}. Supported: en, ko, es, pt, fr"
            }), 400

        # Validate speed explicitly: a null or non-numeric value now yields
        # a 400 instead of an unhandled TypeError (500).
        try:
            speed = float(data.get('speed', 1.0))
        except (TypeError, ValueError):
            return jsonify({'error': 'Speed must be a valid number'}), 400
        if speed < 0.5 or speed > 2.0:
            return jsonify({'error': 'Speed must be between 0.5 and 2.0'}), 400

        voice_style = (data.get('voice_style') or 'M1').upper()
        valid_voices = ['M1', 'M2', 'M3', 'M4', 'M5', 'F1', 'F2', 'F3', 'F4', 'F5']
        if voice_style not in valid_voices:
            return jsonify({
                'error': f"Invalid voice: {voice_style}. "
                         f"Available: {', '.join(valid_voices)}"
            }), 400

        logger.info(f"Generating speech: {text[:50]}... (lang={lang}, speed={speed})")

        try:
            tts_instance = get_supertonic_for_voice(voice_style)
        except Exception as e:
            logger.error(f'Failed to initialize TTS with voice {voice_style}: {e}')
            return jsonify({'error': f'Failed to load voice style: {e}'}), 500

        try:
            audio_bytes = tts_instance.generate_speech(
                text=text, lang=lang, speed=speed, total_step=16
            )
        except Exception as e:
            logger.error(f'Speech synthesis failed: {e}')
            return jsonify({'error': f'Speech synthesis failed: {e}'}), 500

        response = make_response(audio_bytes)
        response.headers['Content-Type'] = 'audio/wav'
        response.headers['Content-Length'] = len(audio_bytes)
        response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
        return response

    except ValueError as e:
        return jsonify({'error': f'Invalid input: {e}'}), 400
    except Exception as e:
        import traceback
        logger.error(f'TTS endpoint error: {e}')
        logger.error(traceback.format_exc())
        return jsonify({'error': 'Internal server error'}), 500
2076
+
2077
+ # ---------------------------------------------------------------------------
2078
+ # POST /api/tts/preview (P4-T5: TTS voice preview)
2079
+ # ---------------------------------------------------------------------------
2080
+
2081
# Default sample phrase spoken when the client does not supply preview text.
_PREVIEW_TEXT = "Hello! This is a preview of the selected voice."
2082
+
2083
+
2084
@conversation_bp.route('/api/tts/preview', methods=['POST'])
def tts_preview():
    """
    Generate a short audio preview for a given TTS voice.

    Request JSON (all optional):
        provider : str — TTS provider ID (default: 'supertonic')
        voice    : str — Voice ID (default: provider default, e.g. 'M1')
        text     : str — Custom preview text (max 200 chars; default sample phrase)

    Returns JSON:
        audio_b64 : str — Base64-encoded WAV audio
        provider  : str — Provider used
        voice     : str — Voice used
    """
    try:
        payload = request.get_json(silent=True) or {}

        provider_id = str(payload.get('provider', 'supertonic')).strip()
        voice = payload.get('voice')
        # Clamp custom text to 200 chars; fall back to the sample phrase
        # when the (trimmed) text is empty.
        text = str(payload.get('text', _PREVIEW_TEXT)).strip()[:200] or _PREVIEW_TEXT

        # Reject unknown providers up front with the list of valid ones.
        try:
            get_provider(provider_id)
        except ValueError:
            available = ', '.join(p['provider_id'] for p in list_providers())
            return jsonify({
                'error': f"Unknown provider: {provider_id}",
                'available_providers': available,
            }), 400

        logger.info(f"TTS preview: provider={provider_id}, voice={voice}, text='{text[:40]}'")

        audio_b64 = _tts_generate_b64(text=text, voice=voice, tts_provider=provider_id)

        if audio_b64 is None:
            return jsonify({'error': 'TTS generation failed — check server logs'}), 500

        return jsonify({
            'audio_b64': audio_b64,
            'provider': provider_id,
            'voice': voice or 'default',
        })

    except Exception as e:
        import traceback
        logger.error(f'TTS preview error: {e}')
        logger.error(traceback.format_exc())
        return jsonify({'error': 'Internal server error'}), 500
2138
+
2139
+
2140
@conversation_bp.route('/api/stt-events', methods=['POST'])
def stt_events():
    """Receive STT error/status events posted by the browser.

    Events are echoed to stdout in the "### STT_ERROR:" format that the
    session monitor parses from the container logs.  Benign errors
    (no-speech, aborted) are filtered client-side before posting.
    """
    try:
        payload = request.get_json(silent=True) or {}
        code = payload.get('error', 'unknown')
        msg = payload.get('message', '')
        prov = payload.get('provider', 'webspeech')
        src = payload.get('source', 'stt')  # 'stt' or 'wake_word'

        # print (not logger) so the line lands on container stdout in the
        # exact session-monitor-parseable format.
        print(f"### STT_ERROR: {code} — {msg} (provider={prov} source={src})",
              flush=True)
        return jsonify({'ok': True})
    except Exception:
        return jsonify({'ok': False}), 500