voice-mode 4.4.0__py3-none-any.whl → 4.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- voice_mode/__version__.py +1 -1
- voice_mode/cli.py +79 -3
- voice_mode/cli_commands/transcribe.py +7 -6
- voice_mode/config.py +1 -1
- voice_mode/conversation_logger.py +6 -0
- voice_mode/core.py +9 -2
- voice_mode/frontend/.next/BUILD_ID +1 -1
- voice_mode/frontend/.next/app-build-manifest.json +5 -5
- voice_mode/frontend/.next/build-manifest.json +3 -3
- voice_mode/frontend/.next/next-minimal-server.js.nft.json +1 -1
- voice_mode/frontend/.next/next-server.js.nft.json +1 -1
- voice_mode/frontend/.next/prerender-manifest.json +1 -1
- voice_mode/frontend/.next/required-server-files.json +1 -1
- voice_mode/frontend/.next/server/app/_not-found/page.js +1 -1
- voice_mode/frontend/.next/server/app/_not-found/page_client-reference-manifest.js +1 -1
- voice_mode/frontend/.next/server/app/_not-found.html +1 -1
- voice_mode/frontend/.next/server/app/_not-found.rsc +1 -1
- voice_mode/frontend/.next/server/app/api/connection-details/route.js +2 -2
- voice_mode/frontend/.next/server/app/favicon.ico/route.js +2 -2
- voice_mode/frontend/.next/server/app/index.html +1 -1
- voice_mode/frontend/.next/server/app/index.rsc +2 -2
- voice_mode/frontend/.next/server/app/page.js +3 -3
- voice_mode/frontend/.next/server/app/page_client-reference-manifest.js +1 -1
- voice_mode/frontend/.next/server/chunks/994.js +1 -1
- voice_mode/frontend/.next/server/middleware-build-manifest.js +1 -1
- voice_mode/frontend/.next/server/next-font-manifest.js +1 -1
- voice_mode/frontend/.next/server/next-font-manifest.json +1 -1
- voice_mode/frontend/.next/server/pages/404.html +1 -1
- voice_mode/frontend/.next/server/pages/500.html +1 -1
- voice_mode/frontend/.next/server/server-reference-manifest.json +1 -1
- voice_mode/frontend/.next/standalone/.next/BUILD_ID +1 -1
- voice_mode/frontend/.next/standalone/.next/app-build-manifest.json +5 -5
- voice_mode/frontend/.next/standalone/.next/build-manifest.json +3 -3
- voice_mode/frontend/.next/standalone/.next/prerender-manifest.json +1 -1
- voice_mode/frontend/.next/standalone/.next/required-server-files.json +1 -1
- voice_mode/frontend/.next/standalone/.next/server/app/_not-found/page.js +1 -1
- voice_mode/frontend/.next/standalone/.next/server/app/_not-found/page_client-reference-manifest.js +1 -1
- voice_mode/frontend/.next/standalone/.next/server/app/_not-found.html +1 -1
- voice_mode/frontend/.next/standalone/.next/server/app/_not-found.rsc +1 -1
- voice_mode/frontend/.next/standalone/.next/server/app/api/connection-details/route.js +2 -2
- voice_mode/frontend/.next/standalone/.next/server/app/favicon.ico/route.js +2 -2
- voice_mode/frontend/.next/standalone/.next/server/app/index.html +1 -1
- voice_mode/frontend/.next/standalone/.next/server/app/index.rsc +2 -2
- voice_mode/frontend/.next/standalone/.next/server/app/page.js +3 -3
- voice_mode/frontend/.next/standalone/.next/server/app/page_client-reference-manifest.js +1 -1
- voice_mode/frontend/.next/standalone/.next/server/chunks/994.js +1 -1
- voice_mode/frontend/.next/standalone/.next/server/middleware-build-manifest.js +1 -1
- voice_mode/frontend/.next/standalone/.next/server/next-font-manifest.js +1 -1
- voice_mode/frontend/.next/standalone/.next/server/next-font-manifest.json +1 -1
- voice_mode/frontend/.next/standalone/.next/server/pages/404.html +1 -1
- voice_mode/frontend/.next/standalone/.next/server/pages/500.html +1 -1
- voice_mode/frontend/.next/standalone/.next/server/server-reference-manifest.json +1 -1
- voice_mode/frontend/.next/standalone/server.js +1 -1
- voice_mode/frontend/.next/static/chunks/app/layout-d3ec7f6f14ea7396.js +1 -0
- voice_mode/frontend/.next/static/chunks/app/{page-ae0d14863ed895ea.js → page-471796963fb1a4bd.js} +1 -1
- voice_mode/frontend/.next/static/chunks/{main-app-836e76fc70b52220.js → main-app-78da5e437b6a2a9f.js} +1 -1
- voice_mode/frontend/.next/trace +43 -43
- voice_mode/frontend/.next/types/app/api/connection-details/route.ts +1 -1
- voice_mode/frontend/.next/types/app/layout.ts +1 -1
- voice_mode/frontend/.next/types/app/page.ts +1 -1
- voice_mode/frontend/package-lock.json +26 -15
- voice_mode/provider_discovery.py +55 -79
- voice_mode/providers.py +61 -45
- voice_mode/simple_failover.py +41 -12
- voice_mode/tools/__init__.py +138 -30
- voice_mode/tools/converse.py +148 -337
- voice_mode/tools/diagnostics.py +2 -1
- voice_mode/tools/voice_registry.py +24 -28
- {voice_mode-4.4.0.dist-info → voice_mode-4.5.0.dist-info}/METADATA +5 -2
- {voice_mode-4.4.0.dist-info → voice_mode-4.5.0.dist-info}/RECORD +74 -74
- voice_mode/frontend/.next/static/chunks/app/layout-917e8410913fe899.js +0 -1
- /voice_mode/frontend/.next/static/{WhZriRkBKVNPSmCnOFRav → Ni4GIqyDdn0QehvmlLBZg}/_buildManifest.js +0 -0
- /voice_mode/frontend/.next/static/{WhZriRkBKVNPSmCnOFRav → Ni4GIqyDdn0QehvmlLBZg}/_ssgManifest.js +0 -0
- {voice_mode-4.4.0.dist-info → voice_mode-4.5.0.dist-info}/WHEEL +0 -0
- {voice_mode-4.4.0.dist-info → voice_mode-4.5.0.dist-info}/entry_points.txt +0 -0
voice_mode/tools/converse.py
CHANGED
@@ -57,13 +57,6 @@ from voice_mode.config import (
|
|
57
57
|
TTS_MODELS
|
58
58
|
)
|
59
59
|
import voice_mode.config
|
60
|
-
from voice_mode.providers import (
|
61
|
-
get_tts_client_and_voice,
|
62
|
-
get_stt_client,
|
63
|
-
is_provider_available,
|
64
|
-
get_provider_by_voice,
|
65
|
-
select_best_voice
|
66
|
-
)
|
67
60
|
from voice_mode.provider_discovery import provider_registry
|
68
61
|
from voice_mode.core import (
|
69
62
|
get_openai_clients,
|
@@ -159,83 +152,70 @@ async def startup_initialization():
|
|
159
152
|
|
160
153
|
|
161
154
|
async def get_tts_config(provider: Optional[str] = None, voice: Optional[str] = None, model: Optional[str] = None, instructions: Optional[str] = None):
|
162
|
-
"""Get TTS configuration
|
163
|
-
|
164
|
-
|
155
|
+
"""Get TTS configuration - simplified to use direct config"""
|
156
|
+
from voice_mode.provider_discovery import detect_provider_type
|
157
|
+
|
165
158
|
# Validate instructions usage
|
166
159
|
if instructions and model != "gpt-4o-mini-tts":
|
167
160
|
logger.warning(f"Instructions parameter is only supported with gpt-4o-mini-tts model, ignoring for model: {model}")
|
168
161
|
instructions = None
|
169
|
-
|
162
|
+
|
170
163
|
# Map provider names to base URLs
|
171
164
|
provider_urls = {
|
172
165
|
'openai': 'https://api.openai.com/v1',
|
173
166
|
'kokoro': 'http://127.0.0.1:8880/v1'
|
174
167
|
}
|
175
|
-
|
168
|
+
|
176
169
|
# Convert provider name to URL if it's a known provider
|
177
|
-
base_url =
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
'voice': selected_voice,
|
196
|
-
'instructions': instructions,
|
197
|
-
'provider': endpoint_info.base_url, # For logging
|
198
|
-
'provider_type': endpoint_info.provider_type
|
199
|
-
}
|
200
|
-
except Exception as e:
|
201
|
-
logger.error(f"Failed to get TTS client: {e}")
|
202
|
-
# Fallback to legacy behavior
|
203
|
-
return {
|
204
|
-
'client_key': 'tts',
|
205
|
-
'base_url': 'https://api.openai.com/v1', # Fallback to OpenAI
|
206
|
-
'model': model or 'tts-1',
|
207
|
-
'voice': voice or 'alloy',
|
208
|
-
'instructions': instructions,
|
209
|
-
'provider_type': 'openai'
|
210
|
-
}
|
170
|
+
base_url = None
|
171
|
+
if provider:
|
172
|
+
base_url = provider_urls.get(provider, provider)
|
173
|
+
|
174
|
+
# Use first available endpoint from config
|
175
|
+
if not base_url:
|
176
|
+
base_url = TTS_BASE_URLS[0] if TTS_BASE_URLS else 'https://api.openai.com/v1'
|
177
|
+
|
178
|
+
provider_type = detect_provider_type(base_url)
|
179
|
+
|
180
|
+
# Return simplified configuration
|
181
|
+
return {
|
182
|
+
'base_url': base_url,
|
183
|
+
'model': model or TTS_MODELS[0] if TTS_MODELS else 'tts-1',
|
184
|
+
'voice': voice or TTS_VOICES[0] if TTS_VOICES else 'alloy',
|
185
|
+
'instructions': instructions,
|
186
|
+
'provider_type': provider_type
|
187
|
+
}
|
211
188
|
|
212
189
|
|
213
190
|
async def get_stt_config(provider: Optional[str] = None):
|
214
|
-
"""Get STT configuration
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
191
|
+
"""Get STT configuration - simplified to use direct config"""
|
192
|
+
from voice_mode.provider_discovery import detect_provider_type
|
193
|
+
from voice_mode.config import STT_BASE_URLS
|
194
|
+
|
195
|
+
# Map provider names to base URLs
|
196
|
+
provider_urls = {
|
197
|
+
'whisper-local': 'http://127.0.0.1:2022/v1',
|
198
|
+
'openai-whisper': 'https://api.openai.com/v1'
|
199
|
+
}
|
200
|
+
|
201
|
+
# Convert provider name to URL if it's a known provider
|
202
|
+
base_url = None
|
203
|
+
if provider:
|
204
|
+
base_url = provider_urls.get(provider, provider)
|
205
|
+
|
206
|
+
# Use first available endpoint from config
|
207
|
+
if not base_url:
|
208
|
+
base_url = STT_BASE_URLS[0] if STT_BASE_URLS else 'https://api.openai.com/v1'
|
209
|
+
|
210
|
+
provider_type = detect_provider_type(base_url)
|
211
|
+
|
212
|
+
# Return simplified configuration
|
213
|
+
return {
|
214
|
+
'base_url': base_url,
|
215
|
+
'model': 'whisper-1',
|
216
|
+
'provider': 'whisper-local' if '127.0.0.1' in base_url or 'localhost' in base_url else 'openai-whisper',
|
217
|
+
'provider_type': provider_type
|
218
|
+
}
|
239
219
|
|
240
220
|
|
241
221
|
|
@@ -254,162 +234,25 @@ async def text_to_speech_with_failover(
|
|
254
234
|
Returns:
|
255
235
|
Tuple of (success, tts_metrics, tts_config)
|
256
236
|
"""
|
257
|
-
from voice_mode.config import SIMPLE_FAILOVER
|
258
|
-
|
259
237
|
# Apply pronunciation rules if enabled
|
260
238
|
if pronounce_enabled():
|
261
239
|
pronounce_mgr = get_pronounce_manager()
|
262
240
|
message = pronounce_mgr.process_tts(message)
|
263
|
-
|
264
|
-
#
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
)
|
279
|
-
|
280
|
-
# Original implementation with health checks
|
281
|
-
from voice_mode.provider_discovery import provider_registry
|
282
|
-
|
283
|
-
# Track which URLs we've tried
|
284
|
-
tried_urls = set()
|
285
|
-
last_error = None
|
286
|
-
|
287
|
-
# If initial_provider specified, try it first
|
288
|
-
if initial_provider:
|
289
|
-
provider_urls = {'openai': 'https://api.openai.com/v1', 'kokoro': 'http://127.0.0.1:8880/v1'}
|
290
|
-
initial_url = provider_urls.get(initial_provider, initial_provider)
|
291
|
-
if initial_url:
|
292
|
-
tried_urls.add(initial_url)
|
293
|
-
try:
|
294
|
-
tts_config = await get_tts_config(initial_provider, voice, model, instructions)
|
295
|
-
|
296
|
-
# Handle both new client object and legacy client_key
|
297
|
-
if 'client' in tts_config:
|
298
|
-
openai_clients['_temp_tts'] = tts_config['client']
|
299
|
-
client_key = '_temp_tts'
|
300
|
-
else:
|
301
|
-
client_key = tts_config.get('client_key', 'tts')
|
302
|
-
|
303
|
-
# Get conversation ID from logger
|
304
|
-
conversation_logger = get_conversation_logger()
|
305
|
-
conversation_id = conversation_logger.conversation_id
|
306
|
-
|
307
|
-
success, tts_metrics = await text_to_speech(
|
308
|
-
text=message,
|
309
|
-
openai_clients=openai_clients,
|
310
|
-
tts_model=tts_config['model'],
|
311
|
-
tts_base_url=tts_config['base_url'],
|
312
|
-
tts_voice=tts_config['voice'],
|
313
|
-
debug=DEBUG,
|
314
|
-
debug_dir=DEBUG_DIR if DEBUG else None,
|
315
|
-
save_audio=SAVE_AUDIO,
|
316
|
-
audio_dir=AUDIO_DIR if SAVE_AUDIO else None,
|
317
|
-
client_key=client_key,
|
318
|
-
instructions=tts_config.get('instructions'),
|
319
|
-
audio_format=audio_format,
|
320
|
-
conversation_id=conversation_id,
|
321
|
-
speed=speed
|
322
|
-
)
|
323
|
-
|
324
|
-
# Clean up temporary client
|
325
|
-
if '_temp_tts' in openai_clients:
|
326
|
-
del openai_clients['_temp_tts']
|
327
|
-
|
328
|
-
if success:
|
329
|
-
return success, tts_metrics, tts_config
|
330
|
-
|
331
|
-
# Mark endpoint as unhealthy
|
332
|
-
await provider_registry.mark_unhealthy('tts', tts_config['base_url'], 'TTS request failed')
|
333
|
-
|
334
|
-
except Exception as e:
|
335
|
-
last_error = str(e)
|
336
|
-
logger.warning(f"Initial provider {initial_provider} failed: {e}")
|
337
|
-
logger.debug(f"Full error details for {initial_provider}:", exc_info=True)
|
338
|
-
|
339
|
-
# Try remaining endpoints in order
|
340
|
-
from voice_mode.config import TTS_BASE_URLS
|
341
|
-
|
342
|
-
for base_url in TTS_BASE_URLS:
|
343
|
-
if base_url in tried_urls:
|
344
|
-
continue
|
345
|
-
|
346
|
-
tried_urls.add(base_url)
|
347
|
-
|
348
|
-
try:
|
349
|
-
# Try to get config for this specific base URL
|
350
|
-
tts_config = await get_tts_config(None, voice, model, instructions)
|
351
|
-
|
352
|
-
# Skip if we got a different URL than requested (means our preferred wasn't available)
|
353
|
-
if tts_config.get('base_url') != base_url:
|
354
|
-
continue
|
355
|
-
|
356
|
-
# Handle both new client object and legacy client_key
|
357
|
-
if 'client' in tts_config:
|
358
|
-
openai_clients['_temp_tts'] = tts_config['client']
|
359
|
-
client_key = '_temp_tts'
|
360
|
-
else:
|
361
|
-
client_key = tts_config.get('client_key', 'tts')
|
362
|
-
|
363
|
-
# Get conversation ID from logger
|
364
|
-
conversation_logger = get_conversation_logger()
|
365
|
-
conversation_id = conversation_logger.conversation_id
|
366
|
-
|
367
|
-
success, tts_metrics = await text_to_speech(
|
368
|
-
text=message,
|
369
|
-
openai_clients=openai_clients,
|
370
|
-
tts_model=tts_config['model'],
|
371
|
-
tts_base_url=tts_config['base_url'],
|
372
|
-
tts_voice=tts_config['voice'],
|
373
|
-
debug=DEBUG,
|
374
|
-
debug_dir=DEBUG_DIR if DEBUG else None,
|
375
|
-
save_audio=SAVE_AUDIO,
|
376
|
-
audio_dir=AUDIO_DIR if SAVE_AUDIO else None,
|
377
|
-
client_key=client_key,
|
378
|
-
instructions=tts_config.get('instructions'),
|
379
|
-
audio_format=audio_format,
|
380
|
-
conversation_id=conversation_id,
|
381
|
-
speed=speed
|
382
|
-
)
|
383
|
-
|
384
|
-
# Clean up temporary client
|
385
|
-
if '_temp_tts' in openai_clients:
|
386
|
-
del openai_clients['_temp_tts']
|
387
|
-
|
388
|
-
if success:
|
389
|
-
logger.info(f"TTS succeeded with failover to: {base_url}")
|
390
|
-
return success, tts_metrics, tts_config
|
391
|
-
else:
|
392
|
-
# Mark endpoint as unhealthy
|
393
|
-
await provider_registry.mark_unhealthy('tts', base_url, 'TTS request failed')
|
394
|
-
|
395
|
-
except Exception as e:
|
396
|
-
last_error = str(e)
|
397
|
-
logger.warning(f"TTS failed for {base_url}: {e}")
|
398
|
-
# Mark endpoint as unhealthy
|
399
|
-
await provider_registry.mark_unhealthy('tts', base_url, str(e))
|
400
|
-
|
401
|
-
# All endpoints failed
|
402
|
-
logger.error(f"All TTS endpoints failed. Last error: {last_error}")
|
403
|
-
|
404
|
-
# Create a config dict with error information
|
405
|
-
from voice_mode.config import TTS_BASE_URLS as CONFIG_TTS_BASE_URLS
|
406
|
-
error_config = {
|
407
|
-
'error': last_error,
|
408
|
-
'tried_urls': list(tried_urls),
|
409
|
-
'base_url': CONFIG_TTS_BASE_URLS[0] if CONFIG_TTS_BASE_URLS else 'https://api.openai.com/v1'
|
410
|
-
}
|
411
|
-
|
412
|
-
return False, None, error_config
|
241
|
+
|
242
|
+
# Always use simple failover (the only mode now)
|
243
|
+
from voice_mode.simple_failover import simple_tts_failover
|
244
|
+
return await simple_tts_failover(
|
245
|
+
text=message,
|
246
|
+
voice=voice or TTS_VOICES[0],
|
247
|
+
model=model or TTS_MODELS[0],
|
248
|
+
instructions=instructions,
|
249
|
+
audio_format=audio_format,
|
250
|
+
debug=DEBUG,
|
251
|
+
debug_dir=DEBUG_DIR if DEBUG else None,
|
252
|
+
save_audio=SAVE_AUDIO,
|
253
|
+
audio_dir=AUDIO_DIR if SAVE_AUDIO else None,
|
254
|
+
speed=speed
|
255
|
+
)
|
413
256
|
|
414
257
|
|
415
258
|
async def speech_to_text(audio_data: np.ndarray, save_audio: bool = False, audio_dir: Optional[Path] = None, transport: str = "local") -> Optional[str]:
|
@@ -430,122 +273,71 @@ async def speech_to_text_with_failover(
|
|
430
273
|
Returns:
|
431
274
|
Transcribed text or None if all endpoints fail
|
432
275
|
"""
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
276
|
+
# Always use simple failover (the only mode now)
|
277
|
+
import tempfile
|
278
|
+
from voice_mode.conversation_logger import get_conversation_logger
|
279
|
+
from voice_mode.core import save_debug_file, get_debug_filename
|
280
|
+
|
281
|
+
# Determine if we should save the file permanently or use a temp file
|
282
|
+
if save_audio and audio_dir:
|
283
|
+
# Save directly to final location
|
284
|
+
conversation_logger = get_conversation_logger()
|
285
|
+
conversation_id = conversation_logger.conversation_id
|
286
|
+
|
287
|
+
# Create year/month directory structure
|
288
|
+
now = datetime.now()
|
289
|
+
year_dir = audio_dir / str(now.year)
|
290
|
+
month_dir = year_dir / f"{now.month:02d}"
|
291
|
+
month_dir.mkdir(parents=True, exist_ok=True)
|
292
|
+
|
293
|
+
# Generate filename and path
|
294
|
+
filename = get_debug_filename("stt", "wav", conversation_id)
|
295
|
+
wav_file_path = month_dir / filename
|
296
|
+
|
297
|
+
# Write audio data directly to final location
|
298
|
+
write(str(wav_file_path), SAMPLE_RATE, audio_data)
|
299
|
+
logger.info(f"STT audio saved to: {wav_file_path}")
|
300
|
+
|
301
|
+
# Use the saved file for STT
|
302
|
+
with open(wav_file_path, 'rb') as audio_file:
|
303
|
+
from voice_mode.simple_failover import simple_stt_failover
|
304
|
+
stt_result = await simple_stt_failover(
|
305
|
+
audio_file=audio_file,
|
306
|
+
model="whisper-1"
|
307
|
+
)
|
308
|
+
# Extract text and log provider info
|
309
|
+
if isinstance(stt_result, dict):
|
310
|
+
result = stt_result.get("text")
|
311
|
+
provider = stt_result.get("provider", "unknown")
|
312
|
+
logger.info(f"STT Provider Used: {provider}")
|
313
|
+
else:
|
314
|
+
# Backward compatibility if old version
|
315
|
+
result = stt_result
|
316
|
+
# Don't delete - it's our saved audio file
|
317
|
+
else:
|
318
|
+
# Use temporary file that will be deleted
|
319
|
+
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
|
320
|
+
write(tmp_file.name, SAMPLE_RATE, audio_data)
|
321
|
+
tmp_file.flush()
|
322
|
+
|
323
|
+
with open(tmp_file.name, 'rb') as audio_file:
|
463
324
|
from voice_mode.simple_failover import simple_stt_failover
|
464
|
-
|
325
|
+
stt_result = await simple_stt_failover(
|
465
326
|
audio_file=audio_file,
|
466
327
|
model="whisper-1"
|
467
328
|
)
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
)
|
481
|
-
|
482
|
-
# Clean up temp file
|
483
|
-
os.unlink(tmp_file.name)
|
484
|
-
|
485
|
-
return result
|
486
|
-
|
487
|
-
# Original implementation with health checks
|
488
|
-
from voice_mode.provider_discovery import provider_registry
|
489
|
-
|
490
|
-
# Track which URLs we've tried
|
491
|
-
tried_urls = set()
|
492
|
-
last_error = None
|
493
|
-
|
494
|
-
# Try configured endpoints in order
|
495
|
-
for base_url in STT_BASE_URLS:
|
496
|
-
if base_url in tried_urls:
|
497
|
-
continue
|
498
|
-
|
499
|
-
tried_urls.add(base_url)
|
500
|
-
|
501
|
-
try:
|
502
|
-
# Get STT config for this specific endpoint
|
503
|
-
client, selected_model, endpoint_info = await get_stt_client(base_url=base_url)
|
504
|
-
|
505
|
-
if not client:
|
506
|
-
logger.warning(f"No STT client available for {base_url}")
|
507
|
-
continue
|
508
|
-
|
509
|
-
from voice_mode.provider_discovery import detect_provider_type
|
510
|
-
|
511
|
-
stt_config = {
|
512
|
-
'client': client,
|
513
|
-
'model': selected_model,
|
514
|
-
'base_url': endpoint_info.base_url if endpoint_info else base_url,
|
515
|
-
'provider': 'whisper-local' if '127.0.0.1' in base_url or 'localhost' in base_url else 'openai-whisper',
|
516
|
-
'provider_type': detect_provider_type(endpoint_info.base_url if endpoint_info else base_url)
|
517
|
-
}
|
518
|
-
|
519
|
-
logger.info(f"Attempting STT with {stt_config['provider']} at {stt_config['base_url']}")
|
520
|
-
|
521
|
-
# Create openai_clients dict with temporary STT client
|
522
|
-
openai_clients = {'_temp_stt': client}
|
523
|
-
|
524
|
-
# Call original speech_to_text with this config
|
525
|
-
result = await _speech_to_text_internal(
|
526
|
-
audio_data,
|
527
|
-
stt_config,
|
528
|
-
openai_clients,
|
529
|
-
save_audio,
|
530
|
-
audio_dir
|
531
|
-
)
|
532
|
-
|
533
|
-
if result:
|
534
|
-
logger.info(f"STT succeeded with {stt_config['provider']}")
|
535
|
-
return result
|
536
|
-
else:
|
537
|
-
# Mark endpoint as unhealthy if it returned None
|
538
|
-
await provider_registry.mark_unhealthy('stt', base_url, 'STT returned no result')
|
539
|
-
|
540
|
-
except Exception as e:
|
541
|
-
last_error = str(e)
|
542
|
-
logger.warning(f"STT failed for {base_url}: {e}")
|
543
|
-
# Mark endpoint as unhealthy
|
544
|
-
await provider_registry.mark_unhealthy('stt', base_url, str(e))
|
545
|
-
|
546
|
-
# All endpoints failed
|
547
|
-
logger.error(f"All STT endpoints failed. Last error: {last_error}")
|
548
|
-
return None
|
329
|
+
# Return dict with text and provider
|
330
|
+
if isinstance(stt_result, dict):
|
331
|
+
result = stt_result
|
332
|
+
logger.info(f"STT Provider Used: {stt_result.get('provider', 'unknown')}")
|
333
|
+
else:
|
334
|
+
# Backward compatibility - wrap in dict
|
335
|
+
result = {"text": stt_result, "provider": "unknown"} if stt_result else None
|
336
|
+
|
337
|
+
# Clean up temp file
|
338
|
+
os.unlink(tmp_file.name)
|
339
|
+
|
340
|
+
return result
|
549
341
|
|
550
342
|
|
551
343
|
async def _speech_to_text_internal(
|
@@ -732,6 +524,8 @@ async def _speech_to_text_internal(
|
|
732
524
|
provider_type=stt_config.get('provider_type'),
|
733
525
|
audio_format=export_format, # Use actual format from conversion
|
734
526
|
transport=transport,
|
527
|
+
is_fallback=stt_config.get('is_fallback', False),
|
528
|
+
fallback_reason=stt_config.get('fallback_reason'),
|
735
529
|
silence_detection={
|
736
530
|
"enabled": not DISABLE_SILENCE_DETECTION,
|
737
531
|
"vad_aggressiveness": VAD_AGGRESSIVENESS,
|
@@ -1771,6 +1565,8 @@ async def converse(
|
|
1771
1565
|
provider=tts_config.get('provider') if tts_config else (tts_provider if tts_provider else 'openai'),
|
1772
1566
|
provider_url=tts_config.get('base_url') if tts_config else None,
|
1773
1567
|
provider_type=tts_config.get('provider_type') if tts_config else None,
|
1568
|
+
is_fallback=tts_config.get('is_fallback', False) if tts_config else False,
|
1569
|
+
fallback_reason=tts_config.get('fallback_reason') if tts_config else None,
|
1774
1570
|
timing=timing_str,
|
1775
1571
|
audio_format=audio_format,
|
1776
1572
|
transport="speak-only",
|
@@ -1901,6 +1697,8 @@ async def converse(
|
|
1901
1697
|
provider=tts_config.get('provider') if tts_config else (tts_provider if tts_provider else 'openai'),
|
1902
1698
|
provider_url=tts_config.get('base_url') if tts_config else None,
|
1903
1699
|
provider_type=tts_config.get('provider_type') if tts_config else None,
|
1700
|
+
is_fallback=tts_config.get('is_fallback', False) if tts_config else False,
|
1701
|
+
fallback_reason=tts_config.get('fallback_reason') if tts_config else None,
|
1904
1702
|
timing=tts_timing_str,
|
1905
1703
|
audio_format=audio_format,
|
1906
1704
|
transport=transport,
|
@@ -1997,8 +1795,19 @@ async def converse(
|
|
1997
1795
|
event_logger.log_event(event_logger.STT_START)
|
1998
1796
|
|
1999
1797
|
stt_start = time.perf_counter()
|
2000
|
-
|
1798
|
+
stt_result = await speech_to_text(audio_data, SAVE_AUDIO, AUDIO_DIR if SAVE_AUDIO else None, transport)
|
2001
1799
|
timings['stt'] = time.perf_counter() - stt_start
|
1800
|
+
|
1801
|
+
# Extract text and provider from result
|
1802
|
+
if isinstance(stt_result, dict):
|
1803
|
+
response_text = stt_result.get("text")
|
1804
|
+
stt_provider = stt_result.get("provider", "unknown")
|
1805
|
+
if stt_provider != "unknown":
|
1806
|
+
logger.info(f"📡 STT Provider: {stt_provider}")
|
1807
|
+
else:
|
1808
|
+
# Backward compatibility
|
1809
|
+
response_text = stt_result
|
1810
|
+
stt_provider = "unknown"
|
2002
1811
|
|
2003
1812
|
# Log STT complete
|
2004
1813
|
if event_logger:
|
@@ -2110,7 +1919,9 @@ async def converse(
|
|
2110
1919
|
|
2111
1920
|
# Logging already done immediately after TTS and STT complete
|
2112
1921
|
|
2113
|
-
|
1922
|
+
# Include STT provider in result if known
|
1923
|
+
stt_info = f" (STT: {stt_provider})" if 'stt_provider' in locals() and stt_provider != "unknown" else ""
|
1924
|
+
result = f"Voice response: {response_text}{stt_info} | Timing: {timing_str}"
|
2114
1925
|
success = True
|
2115
1926
|
else:
|
2116
1927
|
result = f"No speech detected | Timing: {timing_str}"
|
voice_mode/tools/diagnostics.py
CHANGED
@@ -34,7 +34,8 @@ async def voice_mode_info() -> str:
|
|
34
34
|
for service_type in ["tts", "stt"]:
|
35
35
|
info.append(f"\n{service_type.upper()} Endpoints:")
|
36
36
|
for url, endpoint in provider_registry.registry[service_type].items():
|
37
|
-
status
|
37
|
+
# Show status based on whether endpoint has an error
|
38
|
+
status = "❌" if endpoint.last_error else "✅"
|
38
39
|
info.append(f" {status} {url} ({endpoint.provider_type})")
|
39
40
|
if service_type == "tts" and endpoint.voices:
|
40
41
|
info.append(f" Voices: {', '.join(endpoint.voices[:3])}...")
|
@@ -7,14 +7,14 @@ from voice_mode.provider_discovery import provider_registry
|
|
7
7
|
@mcp.tool()
|
8
8
|
async def voice_registry() -> str:
|
9
9
|
"""Get the current voice provider registry showing all discovered endpoints.
|
10
|
-
|
10
|
+
|
11
11
|
Returns a formatted view of all TTS and STT endpoints with their:
|
12
|
-
- Health status
|
13
12
|
- Available models
|
14
13
|
- Available voices (TTS only)
|
15
|
-
-
|
16
|
-
- Last
|
17
|
-
|
14
|
+
- Provider type
|
15
|
+
- Last check time
|
16
|
+
- Any recent errors
|
17
|
+
|
18
18
|
This allows the LLM to see what voice services are currently available.
|
19
19
|
"""
|
20
20
|
# Ensure registry is initialized
|
@@ -31,36 +31,32 @@ async def voice_registry() -> str:
|
|
31
31
|
lines.append("-" * 30)
|
32
32
|
|
33
33
|
for url, info in registry_data["tts"].items():
|
34
|
-
status = "
|
34
|
+
status = "❌" if info.get("last_error") else "✅"
|
35
35
|
lines.append(f"\n{status} {url}")
|
36
|
-
|
37
|
-
if info["
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
lines.append(f" Last Check: {info['last_check']}")
|
36
|
+
lines.append(f" Provider: {info.get('provider_type', 'unknown')}")
|
37
|
+
lines.append(f" Models: {', '.join(info['models']) if info['models'] else 'none detected'}")
|
38
|
+
lines.append(f" Voices: {', '.join(info['voices']) if info['voices'] else 'none detected'}")
|
39
|
+
|
40
|
+
if info.get("last_error"):
|
41
|
+
lines.append(f" Last Error: {info['last_error']}")
|
42
|
+
|
43
|
+
if info.get('last_check'):
|
44
|
+
lines.append(f" Last Check: {info['last_check']}")
|
47
45
|
|
48
46
|
# STT Endpoints
|
49
47
|
lines.append("\n\nSTT Endpoints:")
|
50
48
|
lines.append("-" * 30)
|
51
49
|
|
52
50
|
for url, info in registry_data["stt"].items():
|
53
|
-
status = "
|
51
|
+
status = "❌" if info.get("last_error") else "✅"
|
54
52
|
lines.append(f"\n{status} {url}")
|
55
|
-
|
56
|
-
if info["
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
lines.append(f" Last Check: {info['last_check']}")
|
53
|
+
lines.append(f" Provider: {info.get('provider_type', 'unknown')}")
|
54
|
+
lines.append(f" Models: {', '.join(info['models']) if info['models'] else 'none detected'}")
|
55
|
+
|
56
|
+
if info.get("last_error"):
|
57
|
+
lines.append(f" Last Error: {info['last_error']}")
|
58
|
+
|
59
|
+
if info.get('last_check'):
|
60
|
+
lines.append(f" Last Check: {info['last_check']}")
|
65
61
|
|
66
62
|
return "\n".join(lines)
|