voice-mode 4.4.0__py3-none-any.whl → 4.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. voice_mode/__version__.py +1 -1
  2. voice_mode/cli.py +79 -3
  3. voice_mode/cli_commands/transcribe.py +7 -6
  4. voice_mode/config.py +1 -1
  5. voice_mode/conversation_logger.py +6 -0
  6. voice_mode/core.py +9 -2
  7. voice_mode/frontend/.next/BUILD_ID +1 -1
  8. voice_mode/frontend/.next/app-build-manifest.json +5 -5
  9. voice_mode/frontend/.next/build-manifest.json +3 -3
  10. voice_mode/frontend/.next/next-minimal-server.js.nft.json +1 -1
  11. voice_mode/frontend/.next/next-server.js.nft.json +1 -1
  12. voice_mode/frontend/.next/prerender-manifest.json +1 -1
  13. voice_mode/frontend/.next/required-server-files.json +1 -1
  14. voice_mode/frontend/.next/server/app/_not-found/page.js +1 -1
  15. voice_mode/frontend/.next/server/app/_not-found/page_client-reference-manifest.js +1 -1
  16. voice_mode/frontend/.next/server/app/_not-found.html +1 -1
  17. voice_mode/frontend/.next/server/app/_not-found.rsc +1 -1
  18. voice_mode/frontend/.next/server/app/api/connection-details/route.js +2 -2
  19. voice_mode/frontend/.next/server/app/favicon.ico/route.js +2 -2
  20. voice_mode/frontend/.next/server/app/index.html +1 -1
  21. voice_mode/frontend/.next/server/app/index.rsc +2 -2
  22. voice_mode/frontend/.next/server/app/page.js +3 -3
  23. voice_mode/frontend/.next/server/app/page_client-reference-manifest.js +1 -1
  24. voice_mode/frontend/.next/server/chunks/994.js +1 -1
  25. voice_mode/frontend/.next/server/middleware-build-manifest.js +1 -1
  26. voice_mode/frontend/.next/server/next-font-manifest.js +1 -1
  27. voice_mode/frontend/.next/server/next-font-manifest.json +1 -1
  28. voice_mode/frontend/.next/server/pages/404.html +1 -1
  29. voice_mode/frontend/.next/server/pages/500.html +1 -1
  30. voice_mode/frontend/.next/server/server-reference-manifest.json +1 -1
  31. voice_mode/frontend/.next/standalone/.next/BUILD_ID +1 -1
  32. voice_mode/frontend/.next/standalone/.next/app-build-manifest.json +5 -5
  33. voice_mode/frontend/.next/standalone/.next/build-manifest.json +3 -3
  34. voice_mode/frontend/.next/standalone/.next/prerender-manifest.json +1 -1
  35. voice_mode/frontend/.next/standalone/.next/required-server-files.json +1 -1
  36. voice_mode/frontend/.next/standalone/.next/server/app/_not-found/page.js +1 -1
  37. voice_mode/frontend/.next/standalone/.next/server/app/_not-found/page_client-reference-manifest.js +1 -1
  38. voice_mode/frontend/.next/standalone/.next/server/app/_not-found.html +1 -1
  39. voice_mode/frontend/.next/standalone/.next/server/app/_not-found.rsc +1 -1
  40. voice_mode/frontend/.next/standalone/.next/server/app/api/connection-details/route.js +2 -2
  41. voice_mode/frontend/.next/standalone/.next/server/app/favicon.ico/route.js +2 -2
  42. voice_mode/frontend/.next/standalone/.next/server/app/index.html +1 -1
  43. voice_mode/frontend/.next/standalone/.next/server/app/index.rsc +2 -2
  44. voice_mode/frontend/.next/standalone/.next/server/app/page.js +3 -3
  45. voice_mode/frontend/.next/standalone/.next/server/app/page_client-reference-manifest.js +1 -1
  46. voice_mode/frontend/.next/standalone/.next/server/chunks/994.js +1 -1
  47. voice_mode/frontend/.next/standalone/.next/server/middleware-build-manifest.js +1 -1
  48. voice_mode/frontend/.next/standalone/.next/server/next-font-manifest.js +1 -1
  49. voice_mode/frontend/.next/standalone/.next/server/next-font-manifest.json +1 -1
  50. voice_mode/frontend/.next/standalone/.next/server/pages/404.html +1 -1
  51. voice_mode/frontend/.next/standalone/.next/server/pages/500.html +1 -1
  52. voice_mode/frontend/.next/standalone/.next/server/server-reference-manifest.json +1 -1
  53. voice_mode/frontend/.next/standalone/server.js +1 -1
  54. voice_mode/frontend/.next/static/chunks/app/layout-d3ec7f6f14ea7396.js +1 -0
  55. voice_mode/frontend/.next/static/chunks/app/{page-ae0d14863ed895ea.js → page-471796963fb1a4bd.js} +1 -1
  56. voice_mode/frontend/.next/static/chunks/{main-app-836e76fc70b52220.js → main-app-78da5e437b6a2a9f.js} +1 -1
  57. voice_mode/frontend/.next/trace +43 -43
  58. voice_mode/frontend/.next/types/app/api/connection-details/route.ts +1 -1
  59. voice_mode/frontend/.next/types/app/layout.ts +1 -1
  60. voice_mode/frontend/.next/types/app/page.ts +1 -1
  61. voice_mode/frontend/package-lock.json +26 -15
  62. voice_mode/provider_discovery.py +55 -79
  63. voice_mode/providers.py +61 -45
  64. voice_mode/simple_failover.py +41 -12
  65. voice_mode/tools/__init__.py +138 -30
  66. voice_mode/tools/converse.py +148 -337
  67. voice_mode/tools/diagnostics.py +2 -1
  68. voice_mode/tools/voice_registry.py +24 -28
  69. {voice_mode-4.4.0.dist-info → voice_mode-4.5.0.dist-info}/METADATA +5 -2
  70. {voice_mode-4.4.0.dist-info → voice_mode-4.5.0.dist-info}/RECORD +74 -74
  71. voice_mode/frontend/.next/static/chunks/app/layout-917e8410913fe899.js +0 -1
  72. /voice_mode/frontend/.next/static/{WhZriRkBKVNPSmCnOFRav → Ni4GIqyDdn0QehvmlLBZg}/_buildManifest.js +0 -0
  73. /voice_mode/frontend/.next/static/{WhZriRkBKVNPSmCnOFRav → Ni4GIqyDdn0QehvmlLBZg}/_ssgManifest.js +0 -0
  74. {voice_mode-4.4.0.dist-info → voice_mode-4.5.0.dist-info}/WHEEL +0 -0
  75. {voice_mode-4.4.0.dist-info → voice_mode-4.5.0.dist-info}/entry_points.txt +0 -0
@@ -57,13 +57,6 @@ from voice_mode.config import (
57
57
  TTS_MODELS
58
58
  )
59
59
  import voice_mode.config
60
- from voice_mode.providers import (
61
- get_tts_client_and_voice,
62
- get_stt_client,
63
- is_provider_available,
64
- get_provider_by_voice,
65
- select_best_voice
66
- )
67
60
  from voice_mode.provider_discovery import provider_registry
68
61
  from voice_mode.core import (
69
62
  get_openai_clients,
@@ -159,83 +152,70 @@ async def startup_initialization():
159
152
 
160
153
 
161
154
  async def get_tts_config(provider: Optional[str] = None, voice: Optional[str] = None, model: Optional[str] = None, instructions: Optional[str] = None):
162
- """Get TTS configuration based on provider selection"""
163
- logger.info(f"[DEBUG] get_tts_config called with provider={provider}, voice={voice}, model={model}")
164
-
155
+ """Get TTS configuration - simplified to use direct config"""
156
+ from voice_mode.provider_discovery import detect_provider_type
157
+
165
158
  # Validate instructions usage
166
159
  if instructions and model != "gpt-4o-mini-tts":
167
160
  logger.warning(f"Instructions parameter is only supported with gpt-4o-mini-tts model, ignoring for model: {model}")
168
161
  instructions = None
169
-
162
+
170
163
  # Map provider names to base URLs
171
164
  provider_urls = {
172
165
  'openai': 'https://api.openai.com/v1',
173
166
  'kokoro': 'http://127.0.0.1:8880/v1'
174
167
  }
175
-
168
+
176
169
  # Convert provider name to URL if it's a known provider
177
- base_url = provider_urls.get(provider, provider)
178
-
179
- # Use new provider selection logic
180
- try:
181
- client, selected_voice, selected_model, endpoint_info = await get_tts_client_and_voice(
182
- voice=voice,
183
- model=model,
184
- base_url=base_url # Now using mapped URL
185
- )
186
-
187
- # Return configuration compatible with existing code
188
- logger.info(f"[DEBUG] TTS endpoint selected: {endpoint_info.base_url} (provider: {endpoint_info.provider_type})")
189
- logger.info(f"[DEBUG] Using voice: {selected_voice}, model: {selected_model}")
190
-
191
- return {
192
- 'client': client,
193
- 'base_url': endpoint_info.base_url,
194
- 'model': selected_model,
195
- 'voice': selected_voice,
196
- 'instructions': instructions,
197
- 'provider': endpoint_info.base_url, # For logging
198
- 'provider_type': endpoint_info.provider_type
199
- }
200
- except Exception as e:
201
- logger.error(f"Failed to get TTS client: {e}")
202
- # Fallback to legacy behavior
203
- return {
204
- 'client_key': 'tts',
205
- 'base_url': 'https://api.openai.com/v1', # Fallback to OpenAI
206
- 'model': model or 'tts-1',
207
- 'voice': voice or 'alloy',
208
- 'instructions': instructions,
209
- 'provider_type': 'openai'
210
- }
170
+ base_url = None
171
+ if provider:
172
+ base_url = provider_urls.get(provider, provider)
173
+
174
+ # Use first available endpoint from config
175
+ if not base_url:
176
+ base_url = TTS_BASE_URLS[0] if TTS_BASE_URLS else 'https://api.openai.com/v1'
177
+
178
+ provider_type = detect_provider_type(base_url)
179
+
180
+ # Return simplified configuration
181
+ return {
182
+ 'base_url': base_url,
183
+ 'model': model or TTS_MODELS[0] if TTS_MODELS else 'tts-1',
184
+ 'voice': voice or TTS_VOICES[0] if TTS_VOICES else 'alloy',
185
+ 'instructions': instructions,
186
+ 'provider_type': provider_type
187
+ }
211
188
 
212
189
 
213
190
  async def get_stt_config(provider: Optional[str] = None):
214
- """Get STT configuration based on provider selection"""
215
- try:
216
- # Use new provider selection logic
217
- client, selected_model, endpoint_info = await get_stt_client(
218
- model=None, # Let system select
219
- base_url=provider # Allow provider to be a base URL
220
- )
221
-
222
- return {
223
- 'client': client,
224
- 'base_url': endpoint_info.base_url,
225
- 'model': selected_model,
226
- 'provider': endpoint_info.base_url, # For logging
227
- 'provider_type': endpoint_info.provider_type
228
- }
229
- except Exception as e:
230
- logger.error(f"Failed to get STT client: {e}")
231
- # Fallback to legacy behavior
232
- return {
233
- 'client_key': 'stt',
234
- 'base_url': 'https://api.openai.com/v1', # Fallback to OpenAI
235
- 'model': 'whisper-1',
236
- 'provider': 'openai-whisper',
237
- 'provider_type': 'openai'
238
- }
191
+ """Get STT configuration - simplified to use direct config"""
192
+ from voice_mode.provider_discovery import detect_provider_type
193
+ from voice_mode.config import STT_BASE_URLS
194
+
195
+ # Map provider names to base URLs
196
+ provider_urls = {
197
+ 'whisper-local': 'http://127.0.0.1:2022/v1',
198
+ 'openai-whisper': 'https://api.openai.com/v1'
199
+ }
200
+
201
+ # Convert provider name to URL if it's a known provider
202
+ base_url = None
203
+ if provider:
204
+ base_url = provider_urls.get(provider, provider)
205
+
206
+ # Use first available endpoint from config
207
+ if not base_url:
208
+ base_url = STT_BASE_URLS[0] if STT_BASE_URLS else 'https://api.openai.com/v1'
209
+
210
+ provider_type = detect_provider_type(base_url)
211
+
212
+ # Return simplified configuration
213
+ return {
214
+ 'base_url': base_url,
215
+ 'model': 'whisper-1',
216
+ 'provider': 'whisper-local' if '127.0.0.1' in base_url or 'localhost' in base_url else 'openai-whisper',
217
+ 'provider_type': provider_type
218
+ }
239
219
 
240
220
 
241
221
 
@@ -254,162 +234,25 @@ async def text_to_speech_with_failover(
254
234
  Returns:
255
235
  Tuple of (success, tts_metrics, tts_config)
256
236
  """
257
- from voice_mode.config import SIMPLE_FAILOVER
258
-
259
237
  # Apply pronunciation rules if enabled
260
238
  if pronounce_enabled():
261
239
  pronounce_mgr = get_pronounce_manager()
262
240
  message = pronounce_mgr.process_tts(message)
263
-
264
- # Use simple failover if enabled
265
- if SIMPLE_FAILOVER:
266
- from voice_mode.simple_failover import simple_tts_failover
267
- return await simple_tts_failover(
268
- text=message,
269
- voice=voice or TTS_VOICES[0],
270
- model=model or TTS_MODELS[0],
271
- instructions=instructions,
272
- audio_format=audio_format,
273
- debug=DEBUG,
274
- debug_dir=DEBUG_DIR if DEBUG else None,
275
- save_audio=SAVE_AUDIO,
276
- audio_dir=AUDIO_DIR if SAVE_AUDIO else None,
277
- speed=speed
278
- )
279
-
280
- # Original implementation with health checks
281
- from voice_mode.provider_discovery import provider_registry
282
-
283
- # Track which URLs we've tried
284
- tried_urls = set()
285
- last_error = None
286
-
287
- # If initial_provider specified, try it first
288
- if initial_provider:
289
- provider_urls = {'openai': 'https://api.openai.com/v1', 'kokoro': 'http://127.0.0.1:8880/v1'}
290
- initial_url = provider_urls.get(initial_provider, initial_provider)
291
- if initial_url:
292
- tried_urls.add(initial_url)
293
- try:
294
- tts_config = await get_tts_config(initial_provider, voice, model, instructions)
295
-
296
- # Handle both new client object and legacy client_key
297
- if 'client' in tts_config:
298
- openai_clients['_temp_tts'] = tts_config['client']
299
- client_key = '_temp_tts'
300
- else:
301
- client_key = tts_config.get('client_key', 'tts')
302
-
303
- # Get conversation ID from logger
304
- conversation_logger = get_conversation_logger()
305
- conversation_id = conversation_logger.conversation_id
306
-
307
- success, tts_metrics = await text_to_speech(
308
- text=message,
309
- openai_clients=openai_clients,
310
- tts_model=tts_config['model'],
311
- tts_base_url=tts_config['base_url'],
312
- tts_voice=tts_config['voice'],
313
- debug=DEBUG,
314
- debug_dir=DEBUG_DIR if DEBUG else None,
315
- save_audio=SAVE_AUDIO,
316
- audio_dir=AUDIO_DIR if SAVE_AUDIO else None,
317
- client_key=client_key,
318
- instructions=tts_config.get('instructions'),
319
- audio_format=audio_format,
320
- conversation_id=conversation_id,
321
- speed=speed
322
- )
323
-
324
- # Clean up temporary client
325
- if '_temp_tts' in openai_clients:
326
- del openai_clients['_temp_tts']
327
-
328
- if success:
329
- return success, tts_metrics, tts_config
330
-
331
- # Mark endpoint as unhealthy
332
- await provider_registry.mark_unhealthy('tts', tts_config['base_url'], 'TTS request failed')
333
-
334
- except Exception as e:
335
- last_error = str(e)
336
- logger.warning(f"Initial provider {initial_provider} failed: {e}")
337
- logger.debug(f"Full error details for {initial_provider}:", exc_info=True)
338
-
339
- # Try remaining endpoints in order
340
- from voice_mode.config import TTS_BASE_URLS
341
-
342
- for base_url in TTS_BASE_URLS:
343
- if base_url in tried_urls:
344
- continue
345
-
346
- tried_urls.add(base_url)
347
-
348
- try:
349
- # Try to get config for this specific base URL
350
- tts_config = await get_tts_config(None, voice, model, instructions)
351
-
352
- # Skip if we got a different URL than requested (means our preferred wasn't available)
353
- if tts_config.get('base_url') != base_url:
354
- continue
355
-
356
- # Handle both new client object and legacy client_key
357
- if 'client' in tts_config:
358
- openai_clients['_temp_tts'] = tts_config['client']
359
- client_key = '_temp_tts'
360
- else:
361
- client_key = tts_config.get('client_key', 'tts')
362
-
363
- # Get conversation ID from logger
364
- conversation_logger = get_conversation_logger()
365
- conversation_id = conversation_logger.conversation_id
366
-
367
- success, tts_metrics = await text_to_speech(
368
- text=message,
369
- openai_clients=openai_clients,
370
- tts_model=tts_config['model'],
371
- tts_base_url=tts_config['base_url'],
372
- tts_voice=tts_config['voice'],
373
- debug=DEBUG,
374
- debug_dir=DEBUG_DIR if DEBUG else None,
375
- save_audio=SAVE_AUDIO,
376
- audio_dir=AUDIO_DIR if SAVE_AUDIO else None,
377
- client_key=client_key,
378
- instructions=tts_config.get('instructions'),
379
- audio_format=audio_format,
380
- conversation_id=conversation_id,
381
- speed=speed
382
- )
383
-
384
- # Clean up temporary client
385
- if '_temp_tts' in openai_clients:
386
- del openai_clients['_temp_tts']
387
-
388
- if success:
389
- logger.info(f"TTS succeeded with failover to: {base_url}")
390
- return success, tts_metrics, tts_config
391
- else:
392
- # Mark endpoint as unhealthy
393
- await provider_registry.mark_unhealthy('tts', base_url, 'TTS request failed')
394
-
395
- except Exception as e:
396
- last_error = str(e)
397
- logger.warning(f"TTS failed for {base_url}: {e}")
398
- # Mark endpoint as unhealthy
399
- await provider_registry.mark_unhealthy('tts', base_url, str(e))
400
-
401
- # All endpoints failed
402
- logger.error(f"All TTS endpoints failed. Last error: {last_error}")
403
-
404
- # Create a config dict with error information
405
- from voice_mode.config import TTS_BASE_URLS as CONFIG_TTS_BASE_URLS
406
- error_config = {
407
- 'error': last_error,
408
- 'tried_urls': list(tried_urls),
409
- 'base_url': CONFIG_TTS_BASE_URLS[0] if CONFIG_TTS_BASE_URLS else 'https://api.openai.com/v1'
410
- }
411
-
412
- return False, None, error_config
241
+
242
+ # Always use simple failover (the only mode now)
243
+ from voice_mode.simple_failover import simple_tts_failover
244
+ return await simple_tts_failover(
245
+ text=message,
246
+ voice=voice or TTS_VOICES[0],
247
+ model=model or TTS_MODELS[0],
248
+ instructions=instructions,
249
+ audio_format=audio_format,
250
+ debug=DEBUG,
251
+ debug_dir=DEBUG_DIR if DEBUG else None,
252
+ save_audio=SAVE_AUDIO,
253
+ audio_dir=AUDIO_DIR if SAVE_AUDIO else None,
254
+ speed=speed
255
+ )
413
256
 
414
257
 
415
258
  async def speech_to_text(audio_data: np.ndarray, save_audio: bool = False, audio_dir: Optional[Path] = None, transport: str = "local") -> Optional[str]:
@@ -430,122 +273,71 @@ async def speech_to_text_with_failover(
430
273
  Returns:
431
274
  Transcribed text or None if all endpoints fail
432
275
  """
433
- from voice_mode.config import SIMPLE_FAILOVER, STT_BASE_URLS
434
-
435
- # Use simple failover if enabled
436
- if SIMPLE_FAILOVER:
437
- import tempfile
438
- from voice_mode.conversation_logger import get_conversation_logger
439
- from voice_mode.core import save_debug_file, get_debug_filename
440
-
441
- # Determine if we should save the file permanently or use a temp file
442
- if save_audio and audio_dir:
443
- # Save directly to final location
444
- conversation_logger = get_conversation_logger()
445
- conversation_id = conversation_logger.conversation_id
446
-
447
- # Create year/month directory structure
448
- now = datetime.now()
449
- year_dir = audio_dir / str(now.year)
450
- month_dir = year_dir / f"{now.month:02d}"
451
- month_dir.mkdir(parents=True, exist_ok=True)
452
-
453
- # Generate filename and path
454
- filename = get_debug_filename("stt", "wav", conversation_id)
455
- wav_file_path = month_dir / filename
456
-
457
- # Write audio data directly to final location
458
- write(str(wav_file_path), SAMPLE_RATE, audio_data)
459
- logger.info(f"STT audio saved to: {wav_file_path}")
460
-
461
- # Use the saved file for STT
462
- with open(wav_file_path, 'rb') as audio_file:
276
+ # Always use simple failover (the only mode now)
277
+ import tempfile
278
+ from voice_mode.conversation_logger import get_conversation_logger
279
+ from voice_mode.core import save_debug_file, get_debug_filename
280
+
281
+ # Determine if we should save the file permanently or use a temp file
282
+ if save_audio and audio_dir:
283
+ # Save directly to final location
284
+ conversation_logger = get_conversation_logger()
285
+ conversation_id = conversation_logger.conversation_id
286
+
287
+ # Create year/month directory structure
288
+ now = datetime.now()
289
+ year_dir = audio_dir / str(now.year)
290
+ month_dir = year_dir / f"{now.month:02d}"
291
+ month_dir.mkdir(parents=True, exist_ok=True)
292
+
293
+ # Generate filename and path
294
+ filename = get_debug_filename("stt", "wav", conversation_id)
295
+ wav_file_path = month_dir / filename
296
+
297
+ # Write audio data directly to final location
298
+ write(str(wav_file_path), SAMPLE_RATE, audio_data)
299
+ logger.info(f"STT audio saved to: {wav_file_path}")
300
+
301
+ # Use the saved file for STT
302
+ with open(wav_file_path, 'rb') as audio_file:
303
+ from voice_mode.simple_failover import simple_stt_failover
304
+ stt_result = await simple_stt_failover(
305
+ audio_file=audio_file,
306
+ model="whisper-1"
307
+ )
308
+ # Extract text and log provider info
309
+ if isinstance(stt_result, dict):
310
+ result = stt_result.get("text")
311
+ provider = stt_result.get("provider", "unknown")
312
+ logger.info(f"STT Provider Used: {provider}")
313
+ else:
314
+ # Backward compatibility if old version
315
+ result = stt_result
316
+ # Don't delete - it's our saved audio file
317
+ else:
318
+ # Use temporary file that will be deleted
319
+ with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
320
+ write(tmp_file.name, SAMPLE_RATE, audio_data)
321
+ tmp_file.flush()
322
+
323
+ with open(tmp_file.name, 'rb') as audio_file:
463
324
  from voice_mode.simple_failover import simple_stt_failover
464
- result = await simple_stt_failover(
325
+ stt_result = await simple_stt_failover(
465
326
  audio_file=audio_file,
466
327
  model="whisper-1"
467
328
  )
468
- # Don't delete - it's our saved audio file
469
- else:
470
- # Use temporary file that will be deleted
471
- with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
472
- write(tmp_file.name, SAMPLE_RATE, audio_data)
473
- tmp_file.flush()
474
-
475
- with open(tmp_file.name, 'rb') as audio_file:
476
- from voice_mode.simple_failover import simple_stt_failover
477
- result = await simple_stt_failover(
478
- audio_file=audio_file,
479
- model="whisper-1"
480
- )
481
-
482
- # Clean up temp file
483
- os.unlink(tmp_file.name)
484
-
485
- return result
486
-
487
- # Original implementation with health checks
488
- from voice_mode.provider_discovery import provider_registry
489
-
490
- # Track which URLs we've tried
491
- tried_urls = set()
492
- last_error = None
493
-
494
- # Try configured endpoints in order
495
- for base_url in STT_BASE_URLS:
496
- if base_url in tried_urls:
497
- continue
498
-
499
- tried_urls.add(base_url)
500
-
501
- try:
502
- # Get STT config for this specific endpoint
503
- client, selected_model, endpoint_info = await get_stt_client(base_url=base_url)
504
-
505
- if not client:
506
- logger.warning(f"No STT client available for {base_url}")
507
- continue
508
-
509
- from voice_mode.provider_discovery import detect_provider_type
510
-
511
- stt_config = {
512
- 'client': client,
513
- 'model': selected_model,
514
- 'base_url': endpoint_info.base_url if endpoint_info else base_url,
515
- 'provider': 'whisper-local' if '127.0.0.1' in base_url or 'localhost' in base_url else 'openai-whisper',
516
- 'provider_type': detect_provider_type(endpoint_info.base_url if endpoint_info else base_url)
517
- }
518
-
519
- logger.info(f"Attempting STT with {stt_config['provider']} at {stt_config['base_url']}")
520
-
521
- # Create openai_clients dict with temporary STT client
522
- openai_clients = {'_temp_stt': client}
523
-
524
- # Call original speech_to_text with this config
525
- result = await _speech_to_text_internal(
526
- audio_data,
527
- stt_config,
528
- openai_clients,
529
- save_audio,
530
- audio_dir
531
- )
532
-
533
- if result:
534
- logger.info(f"STT succeeded with {stt_config['provider']}")
535
- return result
536
- else:
537
- # Mark endpoint as unhealthy if it returned None
538
- await provider_registry.mark_unhealthy('stt', base_url, 'STT returned no result')
539
-
540
- except Exception as e:
541
- last_error = str(e)
542
- logger.warning(f"STT failed for {base_url}: {e}")
543
- # Mark endpoint as unhealthy
544
- await provider_registry.mark_unhealthy('stt', base_url, str(e))
545
-
546
- # All endpoints failed
547
- logger.error(f"All STT endpoints failed. Last error: {last_error}")
548
- return None
329
+ # Return dict with text and provider
330
+ if isinstance(stt_result, dict):
331
+ result = stt_result
332
+ logger.info(f"STT Provider Used: {stt_result.get('provider', 'unknown')}")
333
+ else:
334
+ # Backward compatibility - wrap in dict
335
+ result = {"text": stt_result, "provider": "unknown"} if stt_result else None
336
+
337
+ # Clean up temp file
338
+ os.unlink(tmp_file.name)
339
+
340
+ return result
549
341
 
550
342
 
551
343
  async def _speech_to_text_internal(
@@ -732,6 +524,8 @@ async def _speech_to_text_internal(
732
524
  provider_type=stt_config.get('provider_type'),
733
525
  audio_format=export_format, # Use actual format from conversion
734
526
  transport=transport,
527
+ is_fallback=stt_config.get('is_fallback', False),
528
+ fallback_reason=stt_config.get('fallback_reason'),
735
529
  silence_detection={
736
530
  "enabled": not DISABLE_SILENCE_DETECTION,
737
531
  "vad_aggressiveness": VAD_AGGRESSIVENESS,
@@ -1771,6 +1565,8 @@ async def converse(
1771
1565
  provider=tts_config.get('provider') if tts_config else (tts_provider if tts_provider else 'openai'),
1772
1566
  provider_url=tts_config.get('base_url') if tts_config else None,
1773
1567
  provider_type=tts_config.get('provider_type') if tts_config else None,
1568
+ is_fallback=tts_config.get('is_fallback', False) if tts_config else False,
1569
+ fallback_reason=tts_config.get('fallback_reason') if tts_config else None,
1774
1570
  timing=timing_str,
1775
1571
  audio_format=audio_format,
1776
1572
  transport="speak-only",
@@ -1901,6 +1697,8 @@ async def converse(
1901
1697
  provider=tts_config.get('provider') if tts_config else (tts_provider if tts_provider else 'openai'),
1902
1698
  provider_url=tts_config.get('base_url') if tts_config else None,
1903
1699
  provider_type=tts_config.get('provider_type') if tts_config else None,
1700
+ is_fallback=tts_config.get('is_fallback', False) if tts_config else False,
1701
+ fallback_reason=tts_config.get('fallback_reason') if tts_config else None,
1904
1702
  timing=tts_timing_str,
1905
1703
  audio_format=audio_format,
1906
1704
  transport=transport,
@@ -1997,8 +1795,19 @@ async def converse(
1997
1795
  event_logger.log_event(event_logger.STT_START)
1998
1796
 
1999
1797
  stt_start = time.perf_counter()
2000
- response_text = await speech_to_text(audio_data, SAVE_AUDIO, AUDIO_DIR if SAVE_AUDIO else None, transport)
1798
+ stt_result = await speech_to_text(audio_data, SAVE_AUDIO, AUDIO_DIR if SAVE_AUDIO else None, transport)
2001
1799
  timings['stt'] = time.perf_counter() - stt_start
1800
+
1801
+ # Extract text and provider from result
1802
+ if isinstance(stt_result, dict):
1803
+ response_text = stt_result.get("text")
1804
+ stt_provider = stt_result.get("provider", "unknown")
1805
+ if stt_provider != "unknown":
1806
+ logger.info(f"📡 STT Provider: {stt_provider}")
1807
+ else:
1808
+ # Backward compatibility
1809
+ response_text = stt_result
1810
+ stt_provider = "unknown"
2002
1811
 
2003
1812
  # Log STT complete
2004
1813
  if event_logger:
@@ -2110,7 +1919,9 @@ async def converse(
2110
1919
 
2111
1920
  # Logging already done immediately after TTS and STT complete
2112
1921
 
2113
- result = f"Voice response: {response_text} | Timing: {timing_str}"
1922
+ # Include STT provider in result if known
1923
+ stt_info = f" (STT: {stt_provider})" if 'stt_provider' in locals() and stt_provider != "unknown" else ""
1924
+ result = f"Voice response: {response_text}{stt_info} | Timing: {timing_str}"
2114
1925
  success = True
2115
1926
  else:
2116
1927
  result = f"No speech detected | Timing: {timing_str}"
@@ -34,7 +34,8 @@ async def voice_mode_info() -> str:
34
34
  for service_type in ["tts", "stt"]:
35
35
  info.append(f"\n{service_type.upper()} Endpoints:")
36
36
  for url, endpoint in provider_registry.registry[service_type].items():
37
- status = "✅" if endpoint.healthy else "❌"
37
+ # Show status based on whether endpoint has an error
38
+ status = "❌" if endpoint.last_error else "✅"
38
39
  info.append(f" {status} {url} ({endpoint.provider_type})")
39
40
  if service_type == "tts" and endpoint.voices:
40
41
  info.append(f" Voices: {', '.join(endpoint.voices[:3])}...")
@@ -7,14 +7,14 @@ from voice_mode.provider_discovery import provider_registry
7
7
  @mcp.tool()
8
8
  async def voice_registry() -> str:
9
9
  """Get the current voice provider registry showing all discovered endpoints.
10
-
10
+
11
11
  Returns a formatted view of all TTS and STT endpoints with their:
12
- - Health status
13
12
  - Available models
14
13
  - Available voices (TTS only)
15
- - Response times
16
- - Last health check time
17
-
14
+ - Provider type
15
+ - Last check time
16
+ - Any recent errors
17
+
18
18
  This allows the LLM to see what voice services are currently available.
19
19
  """
20
20
  # Ensure registry is initialized
@@ -31,36 +31,32 @@ async def voice_registry() -> str:
31
31
  lines.append("-" * 30)
32
32
 
33
33
  for url, info in registry_data["tts"].items():
34
- status = "" if info["healthy"] else ""
34
+ status = "" if info.get("last_error") else ""
35
35
  lines.append(f"\n{status} {url}")
36
-
37
- if info["healthy"]:
38
- lines.append(f" Models: {', '.join(info['models']) if info['models'] else 'none detected'}")
39
- lines.append(f" Voices: {', '.join(info['voices']) if info['voices'] else 'none detected'}")
40
- if info["response_time_ms"]:
41
- lines.append(f" Response Time: {info['response_time_ms']:.0f}ms")
42
- else:
43
- if info.get("error"):
44
- lines.append(f" Error: {info['error']}")
45
-
46
- lines.append(f" Last Check: {info['last_check']}")
36
+ lines.append(f" Provider: {info.get('provider_type', 'unknown')}")
37
+ lines.append(f" Models: {', '.join(info['models']) if info['models'] else 'none detected'}")
38
+ lines.append(f" Voices: {', '.join(info['voices']) if info['voices'] else 'none detected'}")
39
+
40
+ if info.get("last_error"):
41
+ lines.append(f" Last Error: {info['last_error']}")
42
+
43
+ if info.get('last_check'):
44
+ lines.append(f" Last Check: {info['last_check']}")
47
45
 
48
46
  # STT Endpoints
49
47
  lines.append("\n\nSTT Endpoints:")
50
48
  lines.append("-" * 30)
51
49
 
52
50
  for url, info in registry_data["stt"].items():
53
- status = "" if info["healthy"] else ""
51
+ status = "" if info.get("last_error") else ""
54
52
  lines.append(f"\n{status} {url}")
55
-
56
- if info["healthy"]:
57
- lines.append(f" Models: {', '.join(info['models']) if info['models'] else 'none detected'}")
58
- if info["response_time_ms"]:
59
- lines.append(f" Response Time: {info['response_time_ms']:.0f}ms")
60
- else:
61
- if info.get("error"):
62
- lines.append(f" Error: {info['error']}")
63
-
64
- lines.append(f" Last Check: {info['last_check']}")
53
+ lines.append(f" Provider: {info.get('provider_type', 'unknown')}")
54
+ lines.append(f" Models: {', '.join(info['models']) if info['models'] else 'none detected'}")
55
+
56
+ if info.get("last_error"):
57
+ lines.append(f" Last Error: {info['last_error']}")
58
+
59
+ if info.get('last_check'):
60
+ lines.append(f" Last Check: {info['last_check']}")
65
61
 
66
62
  return "\n".join(lines)