voice-mode 3.34.3__py3-none-any.whl → 4.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. voice_mode/__version__.py +1 -1
  2. voice_mode/cli.py +8 -0
  3. voice_mode/cli_commands/pronounce_commands.py +223 -0
  4. voice_mode/cli_commands/transcribe.py +141 -0
  5. voice_mode/config.py +139 -37
  6. voice_mode/data/default_pronunciation.yaml +268 -0
  7. voice_mode/frontend/.next/BUILD_ID +1 -0
  8. voice_mode/frontend/.next/app-build-manifest.json +28 -0
  9. voice_mode/frontend/.next/app-path-routes-manifest.json +1 -0
  10. voice_mode/frontend/.next/build-manifest.json +32 -0
  11. voice_mode/frontend/.next/export-marker.json +1 -0
  12. voice_mode/frontend/.next/images-manifest.json +1 -0
  13. voice_mode/frontend/.next/next-minimal-server.js.nft.json +1 -0
  14. voice_mode/frontend/.next/next-server.js.nft.json +1 -0
  15. voice_mode/frontend/.next/package.json +1 -0
  16. voice_mode/frontend/.next/prerender-manifest.json +1 -0
  17. voice_mode/frontend/.next/react-loadable-manifest.json +1 -0
  18. voice_mode/frontend/.next/required-server-files.json +1 -0
  19. voice_mode/frontend/.next/routes-manifest.json +1 -0
  20. voice_mode/frontend/.next/server/app/_not-found/page.js +1 -0
  21. voice_mode/frontend/.next/server/app/_not-found/page.js.nft.json +1 -0
  22. voice_mode/frontend/.next/server/app/_not-found/page_client-reference-manifest.js +1 -0
  23. voice_mode/frontend/.next/server/app/_not-found.html +1 -0
  24. voice_mode/frontend/.next/server/app/_not-found.meta +6 -0
  25. voice_mode/frontend/.next/server/app/_not-found.rsc +9 -0
  26. voice_mode/frontend/.next/server/app/api/connection-details/route.js +12 -0
  27. voice_mode/frontend/.next/server/app/api/connection-details/route.js.nft.json +1 -0
  28. voice_mode/frontend/.next/server/app/favicon.ico/route.js +12 -0
  29. voice_mode/frontend/.next/server/app/favicon.ico/route.js.nft.json +1 -0
  30. voice_mode/frontend/.next/server/app/favicon.ico.body +0 -0
  31. voice_mode/frontend/.next/server/app/favicon.ico.meta +1 -0
  32. voice_mode/frontend/.next/server/app/index.html +1 -0
  33. voice_mode/frontend/.next/server/app/index.meta +5 -0
  34. voice_mode/frontend/.next/server/app/index.rsc +7 -0
  35. voice_mode/frontend/.next/server/app/page.js +11 -0
  36. voice_mode/frontend/.next/server/app/page.js.nft.json +1 -0
  37. voice_mode/frontend/.next/server/app/page_client-reference-manifest.js +1 -0
  38. voice_mode/frontend/.next/server/app-paths-manifest.json +6 -0
  39. voice_mode/frontend/.next/server/chunks/463.js +1 -0
  40. voice_mode/frontend/.next/server/chunks/682.js +6 -0
  41. voice_mode/frontend/.next/server/chunks/948.js +2 -0
  42. voice_mode/frontend/.next/server/chunks/994.js +2 -0
  43. voice_mode/frontend/.next/server/chunks/font-manifest.json +1 -0
  44. voice_mode/frontend/.next/server/font-manifest.json +1 -0
  45. voice_mode/frontend/.next/server/functions-config-manifest.json +1 -0
  46. voice_mode/frontend/.next/server/interception-route-rewrite-manifest.js +1 -0
  47. voice_mode/frontend/.next/server/middleware-build-manifest.js +1 -0
  48. voice_mode/frontend/.next/server/middleware-manifest.json +6 -0
  49. voice_mode/frontend/.next/server/middleware-react-loadable-manifest.js +1 -0
  50. voice_mode/frontend/.next/server/next-font-manifest.js +1 -0
  51. voice_mode/frontend/.next/server/next-font-manifest.json +1 -0
  52. voice_mode/frontend/.next/server/pages/404.html +1 -0
  53. voice_mode/frontend/.next/server/pages/500.html +1 -0
  54. voice_mode/frontend/.next/server/pages/_app.js +1 -0
  55. voice_mode/frontend/.next/server/pages/_app.js.nft.json +1 -0
  56. voice_mode/frontend/.next/server/pages/_document.js +1 -0
  57. voice_mode/frontend/.next/server/pages/_document.js.nft.json +1 -0
  58. voice_mode/frontend/.next/server/pages/_error.js +1 -0
  59. voice_mode/frontend/.next/server/pages/_error.js.nft.json +1 -0
  60. voice_mode/frontend/.next/server/pages-manifest.json +1 -0
  61. voice_mode/frontend/.next/server/server-reference-manifest.js +1 -0
  62. voice_mode/frontend/.next/server/server-reference-manifest.json +1 -0
  63. voice_mode/frontend/.next/server/webpack-runtime.js +1 -0
  64. voice_mode/frontend/.next/standalone/.next/BUILD_ID +1 -0
  65. voice_mode/frontend/.next/standalone/.next/app-build-manifest.json +28 -0
  66. voice_mode/frontend/.next/standalone/.next/app-path-routes-manifest.json +1 -0
  67. voice_mode/frontend/.next/standalone/.next/build-manifest.json +32 -0
  68. voice_mode/frontend/.next/standalone/.next/package.json +1 -0
  69. voice_mode/frontend/.next/standalone/.next/prerender-manifest.json +1 -0
  70. voice_mode/frontend/.next/standalone/.next/react-loadable-manifest.json +1 -0
  71. voice_mode/frontend/.next/standalone/.next/required-server-files.json +1 -0
  72. voice_mode/frontend/.next/standalone/.next/routes-manifest.json +1 -0
  73. voice_mode/frontend/.next/standalone/.next/server/app/_not-found/page.js +1 -0
  74. voice_mode/frontend/.next/standalone/.next/server/app/_not-found/page.js.nft.json +1 -0
  75. voice_mode/frontend/.next/standalone/.next/server/app/_not-found/page_client-reference-manifest.js +1 -0
  76. voice_mode/frontend/.next/standalone/.next/server/app/_not-found.html +1 -0
  77. voice_mode/frontend/.next/standalone/.next/server/app/_not-found.meta +6 -0
  78. voice_mode/frontend/.next/standalone/.next/server/app/_not-found.rsc +9 -0
  79. voice_mode/frontend/.next/standalone/.next/server/app/api/connection-details/route.js +12 -0
  80. voice_mode/frontend/.next/standalone/.next/server/app/api/connection-details/route.js.nft.json +1 -0
  81. voice_mode/frontend/.next/standalone/.next/server/app/favicon.ico/route.js +12 -0
  82. voice_mode/frontend/.next/standalone/.next/server/app/favicon.ico/route.js.nft.json +1 -0
  83. voice_mode/frontend/.next/standalone/.next/server/app/favicon.ico.body +0 -0
  84. voice_mode/frontend/.next/standalone/.next/server/app/favicon.ico.meta +1 -0
  85. voice_mode/frontend/.next/standalone/.next/server/app/index.html +1 -0
  86. voice_mode/frontend/.next/standalone/.next/server/app/index.meta +5 -0
  87. voice_mode/frontend/.next/standalone/.next/server/app/index.rsc +7 -0
  88. voice_mode/frontend/.next/standalone/.next/server/app/page.js +11 -0
  89. voice_mode/frontend/.next/standalone/.next/server/app/page.js.nft.json +1 -0
  90. voice_mode/frontend/.next/standalone/.next/server/app/page_client-reference-manifest.js +1 -0
  91. voice_mode/frontend/.next/standalone/.next/server/app-paths-manifest.json +6 -0
  92. voice_mode/frontend/.next/standalone/.next/server/chunks/463.js +1 -0
  93. voice_mode/frontend/.next/standalone/.next/server/chunks/682.js +6 -0
  94. voice_mode/frontend/.next/standalone/.next/server/chunks/948.js +2 -0
  95. voice_mode/frontend/.next/standalone/.next/server/chunks/994.js +2 -0
  96. voice_mode/frontend/.next/standalone/.next/server/font-manifest.json +1 -0
  97. voice_mode/frontend/.next/standalone/.next/server/middleware-build-manifest.js +1 -0
  98. voice_mode/frontend/.next/standalone/.next/server/middleware-manifest.json +6 -0
  99. voice_mode/frontend/.next/standalone/.next/server/middleware-react-loadable-manifest.js +1 -0
  100. voice_mode/frontend/.next/standalone/.next/server/next-font-manifest.js +1 -0
  101. voice_mode/frontend/.next/standalone/.next/server/next-font-manifest.json +1 -0
  102. voice_mode/frontend/.next/standalone/.next/server/pages/404.html +1 -0
  103. voice_mode/frontend/.next/standalone/.next/server/pages/500.html +1 -0
  104. voice_mode/frontend/.next/standalone/.next/server/pages/_app.js +1 -0
  105. voice_mode/frontend/.next/standalone/.next/server/pages/_app.js.nft.json +1 -0
  106. voice_mode/frontend/.next/standalone/.next/server/pages/_document.js +1 -0
  107. voice_mode/frontend/.next/standalone/.next/server/pages/_document.js.nft.json +1 -0
  108. voice_mode/frontend/.next/standalone/.next/server/pages/_error.js +1 -0
  109. voice_mode/frontend/.next/standalone/.next/server/pages/_error.js.nft.json +1 -0
  110. voice_mode/frontend/.next/standalone/.next/server/pages-manifest.json +1 -0
  111. voice_mode/frontend/.next/standalone/.next/server/server-reference-manifest.js +1 -0
  112. voice_mode/frontend/.next/standalone/.next/server/server-reference-manifest.json +1 -0
  113. voice_mode/frontend/.next/standalone/.next/server/webpack-runtime.js +1 -0
  114. voice_mode/frontend/.next/standalone/package.json +40 -0
  115. voice_mode/frontend/.next/standalone/server.js +38 -0
  116. voice_mode/frontend/.next/static/chunks/117-40bc79a2b97edb21.js +2 -0
  117. voice_mode/frontend/.next/static/chunks/144d3bae-2d5f122b82426d88.js +1 -0
  118. voice_mode/frontend/.next/static/chunks/471-bd4b96a33883dfa2.js +3 -0
  119. voice_mode/frontend/.next/static/chunks/app/_not-found/page-5011050e402ab9c8.js +1 -0
  120. voice_mode/frontend/.next/static/chunks/app/layout-fcb9b9ba5b72c7fc.js +1 -0
  121. voice_mode/frontend/.next/static/chunks/app/page-7c7ec2ad413ace39.js +1 -0
  122. voice_mode/frontend/.next/static/chunks/fd9d1056-af324d327b243cf1.js +1 -0
  123. voice_mode/frontend/.next/static/chunks/framework-f66176bb897dc684.js +1 -0
  124. voice_mode/frontend/.next/static/chunks/main-3163eca598b76a9f.js +1 -0
  125. voice_mode/frontend/.next/static/chunks/main-app-d02bd38ac01adb8a.js +1 -0
  126. voice_mode/frontend/.next/static/chunks/pages/_app-72b849fbd24ac258.js +1 -0
  127. voice_mode/frontend/.next/static/chunks/pages/_error-7ba65e1336b92748.js +1 -0
  128. voice_mode/frontend/.next/static/chunks/polyfills-42372ed130431b0a.js +1 -0
  129. voice_mode/frontend/.next/static/chunks/webpack-0ea9b80f19935b70.js +1 -0
  130. voice_mode/frontend/.next/static/css/a2f49a47752b5010.css +3 -0
  131. voice_mode/frontend/.next/static/media/01099be941da1820-s.woff2 +0 -0
  132. voice_mode/frontend/.next/static/media/39883d31a7792467-s.p.woff2 +0 -0
  133. voice_mode/frontend/.next/static/media/6368404d2e8d66fe-s.woff2 +0 -0
  134. voice_mode/frontend/.next/static/pbDjheefW1LwCua_8mPoZ/_buildManifest.js +1 -0
  135. voice_mode/frontend/.next/static/pbDjheefW1LwCua_8mPoZ/_ssgManifest.js +1 -0
  136. voice_mode/frontend/.next/trace +43 -0
  137. voice_mode/frontend/.next/types/app/api/connection-details/route.ts +343 -0
  138. voice_mode/frontend/.next/types/app/layout.ts +79 -0
  139. voice_mode/frontend/.next/types/app/page.ts +79 -0
  140. voice_mode/frontend/.next/types/package.json +1 -0
  141. voice_mode/frontend/package-lock.json +154 -1
  142. voice_mode/pronounce.py +397 -0
  143. voice_mode/providers.py +7 -8
  144. voice_mode/resources/configuration.py +2 -2
  145. voice_mode/tools/configuration_management.py +106 -5
  146. voice_mode/tools/converse.py +109 -0
  147. voice_mode/tools/pronounce.py +245 -0
  148. voice_mode/tools/transcription/__init__.py +14 -0
  149. voice_mode/tools/transcription/backends.py +287 -0
  150. voice_mode/tools/transcription/core.py +136 -0
  151. voice_mode/tools/transcription/formats.py +144 -0
  152. voice_mode/tools/transcription/types.py +52 -0
  153. {voice_mode-3.34.3.dist-info → voice_mode-4.1.0.dist-info}/METADATA +5 -2
  154. voice_mode-4.1.0.dist-info/RECORD +259 -0
  155. voice_mode/voice_preferences.py +0 -125
  156. voice_mode-3.34.3.dist-info/RECORD +0 -116
  157. {voice_mode-3.34.3.dist-info → voice_mode-4.1.0.dist-info}/WHEEL +0 -0
  158. {voice_mode-3.34.3.dist-info → voice_mode-4.1.0.dist-info}/entry_points.txt +0 -0
@@ -5,7 +5,7 @@ import re
5
5
  from pathlib import Path
6
6
  from typing import Dict, Optional, List
7
7
  from voice_mode.server import mcp
8
- from voice_mode.config import BASE_DIR
8
+ from voice_mode.config import BASE_DIR, reload_configuration, find_voicemode_env_files
9
9
  import logging
10
10
 
11
11
  logger = logging.getLogger("voice-mode")
@@ -109,7 +109,7 @@ async def update_config(key: str, value: str) -> str:
109
109
  """Update a configuration value in the voicemode.env file.
110
110
 
111
111
  Args:
112
- key: The configuration key to update (e.g., 'VOICEMODE_TTS_VOICES')
112
+ key: The configuration key to update (e.g., 'VOICEMODE_VOICES')
113
113
  value: The new value for the configuration
114
114
 
115
115
  Returns:
@@ -175,7 +175,7 @@ async def list_config_keys() -> str:
175
175
  ("Provider Configuration", [
176
176
  ("VOICEMODE_TTS_BASE_URLS", "Comma-separated list of TTS endpoints"),
177
177
  ("VOICEMODE_STT_BASE_URLS", "Comma-separated list of STT endpoints"),
178
- ("VOICEMODE_TTS_VOICES", "Comma-separated list of preferred voices"),
178
+ ("VOICEMODE_VOICES", "Comma-separated list of preferred voices"),
179
179
  ("VOICEMODE_TTS_MODELS", "Comma-separated list of preferred models"),
180
180
  ("VOICEMODE_PREFER_LOCAL", "Prefer local providers over cloud (true/false)"),
181
181
  ("VOICEMODE_ALWAYS_TRY_LOCAL", "Always attempt local providers (true/false)"),
@@ -211,6 +211,107 @@ async def list_config_keys() -> str:
211
211
  lines.append(f" {description}")
212
212
  lines.append("")
213
213
 
214
- lines.append("💡 Usage: update_config(key='VOICEMODE_TTS_VOICES', value='af_sky,nova')")
214
+ lines.append("💡 Usage: update_config(key='VOICEMODE_VOICES', value='af_sky,nova')")
215
215
 
216
- return "\n".join(lines)
216
+ return "\n".join(lines)
217
+
218
+
219
+ @mcp.tool()
220
+ async def config_reload() -> str:
221
+ """Reload configuration from .voicemode.env files and clear all caches.
222
+
223
+ This tool reloads configuration from:
224
+ 1. Global ~/.voicemode/voicemode.env file
225
+ 2. Project-specific .voicemode.env files (searched up directory tree)
226
+ 3. Environment variables (highest priority)
227
+
228
+ Returns:
229
+ Status message showing which files were loaded and any changes
230
+ """
231
+ try:
232
+ # Get config files before reload
233
+ old_files = find_voicemode_env_files()
234
+
235
+ # Reload configuration
236
+ reload_configuration()
237
+
238
+ # Get config files after reload
239
+ new_files = find_voicemode_env_files()
240
+
241
+ lines = ["✅ Configuration reloaded successfully!", ""]
242
+
243
+ if new_files:
244
+ lines.append("📁 Configuration files loaded (in order):")
245
+ for i, config_file in enumerate(new_files, 1):
246
+ lines.append(f" {i}. {config_file}")
247
+ else:
248
+ lines.append("📁 No configuration files found - using defaults")
249
+
250
+ lines.append("")
251
+ lines.append("🔄 All caches have been cleared")
252
+ lines.append("📊 Voice preferences and provider settings updated")
253
+
254
+ logger.info(f"Configuration reloaded from {len(new_files)} files")
255
+
256
+ return "\n".join(lines)
257
+
258
+ except Exception as e:
259
+ logger.error(f"Failed to reload configuration: {e}")
260
+ return f"❌ Failed to reload configuration: {str(e)}"
261
+
262
+
263
+ @mcp.tool()
264
+ async def show_config_files() -> str:
265
+ """Show which .voicemode.env files are being used for configuration.
266
+
267
+ This shows the current configuration file discovery and loading order:
268
+ - Global configuration from ~/.voicemode/voicemode.env
269
+ - Project-specific configuration (searched up directory tree)
270
+ - Current working directory for context
271
+
272
+ Returns:
273
+ Formatted list of configuration files and their status
274
+ """
275
+ try:
276
+ config_files = find_voicemode_env_files()
277
+
278
+ lines = ["📋 Voice Mode Configuration Files", "=" * 40, ""]
279
+ lines.append(f"🗂️ Current directory: {Path.cwd()}")
280
+ lines.append("")
281
+
282
+ if config_files:
283
+ lines.append("📁 Configuration files (loading order):")
284
+ lines.append("")
285
+
286
+ for i, config_file in enumerate(config_files, 1):
287
+ status = "✅ EXISTS" if config_file.exists() else "❌ MISSING"
288
+ file_type = ""
289
+
290
+ if config_file.name == "voicemode.env" and config_file.parent.name == ".voicemode":
291
+ if config_file.parent == Path.home() / ".voicemode":
292
+ file_type = " (Global)"
293
+ else:
294
+ file_type = " (Project - in .voicemode dir)"
295
+ elif config_file.name == ".voicemode.env":
296
+ if config_file.parent == Path.cwd():
297
+ file_type = " (Project - current dir)"
298
+ else:
299
+ file_type = " (Project - parent dir)"
300
+
301
+ lines.append(f" {i}. {config_file}{file_type}")
302
+ lines.append(f" {status}")
303
+ lines.append("")
304
+ else:
305
+ lines.append("❌ No configuration files found")
306
+ lines.append("")
307
+ lines.append("💡 Tip: Create ~/.voicemode/voicemode.env for global configuration")
308
+ lines.append("💡 Tip: Create .voicemode.env in project directories for project-specific settings")
309
+
310
+ lines.append("")
311
+ lines.append("🔄 Use reload_config() to reload after making changes")
312
+
313
+ return "\n".join(lines)
314
+
315
+ except Exception as e:
316
+ logger.error(f"Failed to show config files: {e}")
317
+ return f"❌ Failed to show config files: {str(e)}"
@@ -85,6 +85,7 @@ from voice_mode.utils import (
85
85
  log_tool_request_start,
86
86
  log_tool_request_end
87
87
  )
88
+ from voice_mode.pronounce import get_manager as get_pronounce_manager, is_enabled as pronounce_enabled
88
89
 
89
90
  logger = logging.getLogger("voice-mode")
90
91
 
@@ -255,6 +256,11 @@ async def text_to_speech_with_failover(
255
256
  """
256
257
  from voice_mode.config import SIMPLE_FAILOVER
257
258
 
259
+ # Apply pronunciation rules if enabled
260
+ if pronounce_enabled():
261
+ pronounce_mgr = get_pronounce_manager()
262
+ message = pronounce_mgr.process_tts(message)
263
+
258
264
  # Use simple failover if enabled
259
265
  if SIMPLE_FAILOVER:
260
266
  from voice_mode.simple_failover import simple_tts_failover
@@ -695,6 +701,11 @@ async def _speech_to_text_internal(
695
701
  logger.debug(f"STT API response type: {type(transcription)}")
696
702
  text = transcription.strip() if isinstance(transcription, str) else transcription.text.strip()
697
703
 
704
+ # Apply pronunciation rules if enabled
705
+ if text and pronounce_enabled():
706
+ pronounce_mgr = get_pronounce_manager()
707
+ text = pronounce_mgr.process_stt(text)
708
+
698
709
  if text:
699
710
  logger.info(f"✓ STT result: '{text}'")
700
711
 
@@ -875,6 +886,45 @@ def record_audio(duration: float) -> np.ndarray:
875
886
  logger.error(f"Recording failed: {e}")
876
887
  logger.error(f"Audio config when error occurred - Sample rate: {SAMPLE_RATE}, Channels: {CHANNELS}")
877
888
 
889
+ # Check if this is a device error that might be recoverable
890
+ error_str = str(e).lower()
891
+ if any(err in error_str for err in ['device unavailable', 'device disconnected',
892
+ 'invalid device', 'unanticipated host error',
893
+ 'portaudio error']):
894
+ logger.info("Audio device error detected - attempting to reinitialize audio system")
895
+
896
+ # Try to reinitialize sounddevice
897
+ try:
898
+ # Get current default device info before reinit
899
+ try:
900
+ old_device = sd.query_devices(kind='input')
901
+ old_device_name = old_device.get('name', 'Unknown')
902
+ except:
903
+ old_device_name = 'Previous device'
904
+
905
+ sd._terminate()
906
+ sd._initialize()
907
+
908
+ # Get new default device info
909
+ try:
910
+ new_device = sd.query_devices(kind='input')
911
+ new_device_name = new_device.get('name', 'Unknown')
912
+ logger.info(f"Audio system reinitialized - switched from '{old_device_name}' to '{new_device_name}'")
913
+ except:
914
+ logger.info("Audio system reinitialized - retrying with new default device")
915
+
916
+ # Wait a moment for the system to stabilize
917
+ import time as time_module
918
+ time_module.sleep(0.5)
919
+
920
+ # Try recording again with the new device (recursive call)
921
+ logger.info("Retrying recording with new audio device...")
922
+ return record_audio(duration)
923
+
924
+ except Exception as reinit_error:
925
+ logger.error(f"Failed to reinitialize audio: {reinit_error}")
926
+ # Fall through to normal error handling
927
+
878
928
  # Import here to avoid circular imports
879
929
  from voice_mode.utils.audio_diagnostics import get_audio_error_help
880
930
 
@@ -989,6 +1039,14 @@ def record_audio_with_silence_detection(max_duration: float, disable_silence_det
989
1039
  """Callback for continuous audio stream"""
990
1040
  if status:
991
1041
  logger.warning(f"Audio stream status: {status}")
1042
+ # Check for device-related errors
1043
+ status_str = str(status).lower()
1044
+ if any(err in status_str for err in ['device unavailable', 'device disconnected',
1045
+ 'invalid device', 'unanticipated host error',
1046
+ 'stream is stopped', 'portaudio error']):
1047
+ # Signal that we should stop recording due to device error
1048
+ audio_queue.put(None) # Sentinel value to indicate error
1049
+ return
992
1050
  # Put the audio data in the queue for processing
993
1051
  audio_queue.put(indata.copy())
994
1052
 
@@ -1007,6 +1065,12 @@ def record_audio_with_silence_detection(max_duration: float, disable_silence_det
1007
1065
  # Get audio chunk from queue with timeout
1008
1066
  chunk = audio_queue.get(timeout=0.1)
1009
1067
 
1068
+ # Check for error sentinel
1069
+ if chunk is None:
1070
+ logger.error("Audio device error detected - stopping recording")
1071
+ # Raise an exception to trigger recovery logic
1072
+ raise sd.PortAudioError("Audio device disconnected or unavailable")
1073
+
1010
1074
  # Flatten for consistency
1011
1075
  chunk_flat = chunk.flatten()
1012
1076
  chunks.append(chunk_flat)
@@ -1109,6 +1173,45 @@ def record_audio_with_silence_detection(max_duration: float, disable_silence_det
1109
1173
  # Import here to avoid circular imports
1110
1174
  from voice_mode.utils.audio_diagnostics import get_audio_error_help
1111
1175
 
1176
+ # Check if this is a device error that might be recoverable
1177
+ error_str = str(e).lower()
1178
+ if any(err in error_str for err in ['device unavailable', 'device disconnected',
1179
+ 'invalid device', 'unanticipated host error',
1180
+ 'portaudio error']):
1181
+ logger.info("Audio device error detected - attempting to reinitialize audio system")
1182
+
1183
+ # Try to reinitialize sounddevice
1184
+ try:
1185
+ # Get current default device info before reinit
1186
+ try:
1187
+ old_device = sd.query_devices(kind='input')
1188
+ old_device_name = old_device.get('name', 'Unknown')
1189
+ except:
1190
+ old_device_name = 'Previous device'
1191
+
1192
+ sd._terminate()
1193
+ sd._initialize()
1194
+
1195
+ # Get new default device info
1196
+ try:
1197
+ new_device = sd.query_devices(kind='input')
1198
+ new_device_name = new_device.get('name', 'Unknown')
1199
+ logger.info(f"Audio system reinitialized - switched from '{old_device_name}' to '{new_device_name}'")
1200
+ except:
1201
+ logger.info("Audio system reinitialized - retrying with new default device")
1202
+
1203
+ # Wait a moment for the system to stabilize
1204
+ import time as time_module
1205
+ time_module.sleep(0.5)
1206
+
1207
+ # Try recording again with the new device (recursive call in sync context)
1208
+ logger.info("Retrying recording with new audio device...")
1209
+ return record_audio_with_silence_detection(max_duration, disable_silence_detection, min_duration, vad_aggressiveness)
1210
+
1211
+ except Exception as reinit_error:
1212
+ logger.error(f"Failed to reinitialize audio: {reinit_error}")
1213
+ # Fall through to normal error handling
1214
+
1112
1215
  # Get helpful error message
1113
1216
  help_message = get_audio_error_help(e)
1114
1217
  logger.error(f"\n{help_message}")
@@ -1555,6 +1658,12 @@ async def converse(
1555
1658
  # Run startup initialization if needed
1556
1659
  await startup_initialization()
1557
1660
 
1661
+ # Refresh audio device cache to pick up any device changes (AirPods, etc.)
1662
+ # This takes ~1ms and ensures we use the current default device
1663
+ import sounddevice as sd
1664
+ sd._terminate()
1665
+ sd._initialize()
1666
+
1558
1667
  # Get event logger and start session
1559
1668
  event_logger = get_event_logger()
1560
1669
  session_id = None
@@ -0,0 +1,245 @@
1
+ """MCP tools for managing pronunciation rules."""
2
+
3
+ import json
4
+ import yaml
5
+ from typing import Optional, Literal, List, Dict
6
+
7
+ from voice_mode.server import mcp
8
+ from voice_mode.pronounce import get_manager, is_enabled
9
+
10
+
11
+ @mcp.tool()
12
+ async def pronounce(
13
+ action: Literal["list", "add", "remove", "enable", "disable", "test", "reload"],
14
+ pattern: Optional[str] = None,
15
+ replacement: Optional[str] = None,
16
+ rule_type: Literal["tts", "stt"] = "tts",
17
+ description: Optional[str] = None,
18
+ name: Optional[str] = None,
19
+ test_text: Optional[str] = None
20
+ ) -> str:
21
+ """
22
+ Manage pronunciation rules for TTS/STT text processing.
23
+
24
+ This tool allows managing pronunciation rules that improve TTS pronunciation
25
+ and correct STT transcription errors. Rules are applied automatically when
26
+ text is processed.
27
+
28
+ Actions:
29
+ - list: Show all non-private rules (returns count of private rules)
30
+ - add: Add a new rule (requires pattern, replacement, rule_type)
31
+ - remove: Remove a rule by name (requires name, rule_type)
32
+ - enable: Enable a disabled rule (requires name, rule_type)
33
+ - disable: Disable an enabled rule (requires name, rule_type)
34
+ - test: Test rules on text (requires test_text, rule_type)
35
+ - reload: Reload rules from configuration files
36
+
37
+ Examples:
38
+ - List all TTS rules:
39
+ pronunciation_rules(action="list", rule_type="tts")
40
+
41
+ - Add a rule to pronounce "3M" correctly:
42
+ pronunciation_rules(
43
+ action="add",
44
+ pattern=r"\b3M\b",
45
+ replacement="three em",
46
+ rule_type="tts",
47
+ description="Pronounce 3M company name"
48
+ )
49
+
50
+ - Test how text would be pronounced:
51
+ pronunciation_rules(
52
+ action="test",
53
+ test_text="I work at 3M",
54
+ rule_type="tts"
55
+ )
56
+
57
+ - Correct common Whisper mishearing:
58
+ pronunciation_rules(
59
+ action="add",
60
+ pattern="me tool",
61
+ replacement="metool",
62
+ rule_type="stt",
63
+ description="Correct 'me tool' to 'metool'"
64
+ )
65
+
66
+ Args:
67
+ action: The action to perform
68
+ pattern: Regex pattern for add action
69
+ replacement: Replacement text for add action
70
+ rule_type: Type of rule (tts for text-to-speech, stt for speech-to-text)
71
+ description: Human-readable description for add action
72
+ name: Rule name for remove/enable/disable actions
73
+ test_text: Text to test for test action
74
+
75
+ Returns:
76
+ Result of the action as a formatted string
77
+ """
78
+ manager = get_manager()
79
+
80
+ if action == "list":
81
+ # List rules (excluding private ones)
82
+ all_rules = manager.list_rules(include_private=True)
83
+ public_rules = manager.list_rules(include_private=False)
84
+
85
+ # Filter by type if specified
86
+ if rule_type:
87
+ public_rules = [r for r in public_rules if r['direction'] == rule_type]
88
+ all_rules = [r for r in all_rules if r['direction'] == rule_type]
89
+
90
+ # Format the response
91
+ if not public_rules:
92
+ private_count = len(all_rules)
93
+ if private_count > 0:
94
+ return f"No public {rule_type} rules found. ({private_count} private rules hidden)"
95
+ else:
96
+ return f"No {rule_type} rules found."
97
+
98
+ # Build response
99
+ result = f"Pronunciation Rules ({rule_type.upper()}):\n\n"
100
+
101
+ for rule in public_rules:
102
+ status = "✓" if rule['enabled'] else "✗"
103
+ result += f"{status} {rule['name']}: \n"
104
+ result += f" Pattern: {rule['pattern']}\n"
105
+ result += f" Replace: {rule['replacement']}\n"
106
+ if rule['description']:
107
+ result += f" Desc: {rule['description']}\n"
108
+ result += "\n"
109
+
110
+ # Add private rule count if any
111
+ private_count = len(all_rules) - len(public_rules)
112
+ if private_count > 0:
113
+ result += f"({private_count} private rules hidden from view)\n"
114
+
115
+ return result
116
+
117
+ elif action == "add":
118
+ if not pattern or not replacement:
119
+ return "Error: 'add' action requires pattern and replacement"
120
+
121
+ success = manager.add_rule(
122
+ direction=rule_type,
123
+ pattern=pattern,
124
+ replacement=replacement,
125
+ name=name,
126
+ description=description or "",
127
+ enabled=True,
128
+ private=False # MCP-created rules are public
129
+ )
130
+
131
+ if success:
132
+ return f"✓ Rule added successfully for {rule_type.upper()}"
133
+ else:
134
+ return "✗ Failed to add rule. Check if the regex pattern is valid."
135
+
136
+ elif action == "remove":
137
+ if not name:
138
+ return "Error: 'remove' action requires rule name"
139
+
140
+ success = manager.remove_rule(rule_type, name)
141
+
142
+ if success:
143
+ return f"✓ Rule '{name}' removed from {rule_type.upper()}"
144
+ else:
145
+ return f"✗ Rule '{name}' not found in {rule_type.upper()} rules (may be private)"
146
+
147
+ elif action == "enable":
148
+ if not name:
149
+ return "Error: 'enable' action requires rule name"
150
+
151
+ success = manager.enable_rule(rule_type, name)
152
+
153
+ if success:
154
+ return f"✓ Rule '{name}' enabled in {rule_type.upper()}"
155
+ else:
156
+ return f"✗ Failed to enable rule '{name}' (not found or private)"
157
+
158
+ elif action == "disable":
159
+ if not name:
160
+ return "Error: 'disable' action requires rule name"
161
+
162
+ success = manager.disable_rule(rule_type, name)
163
+
164
+ if success:
165
+ return f"✓ Rule '{name}' disabled in {rule_type.upper()}"
166
+ else:
167
+ return f"✗ Failed to disable rule '{name}' (not found or private)"
168
+
169
+ elif action == "test":
170
+ if not test_text:
171
+ return "Error: 'test' action requires test_text"
172
+
173
+ result = manager.test_rule(test_text, rule_type)
174
+
175
+ if test_text != result:
176
+ return f"Original: {test_text}\nModified: {result}\n\nRules were applied to transform the text."
177
+ else:
178
+ return f"No changes: {test_text}\n\nNo rules matched or all rules are disabled."
179
+
180
+ elif action == "reload":
181
+ manager.reload_rules()
182
+
183
+ # Get counts
184
+ all_rules = manager.list_rules(include_private=True)
185
+ tts_count = len([r for r in all_rules if r['direction'] == 'tts'])
186
+ stt_count = len([r for r in all_rules if r['direction'] == 'stt'])
187
+
188
+ return f"✓ Pronunciation rules reloaded\nLoaded {tts_count} TTS rules and {stt_count} STT rules"
189
+
190
+ else:
191
+ return f"Error: Unknown action '{action}'. Use: list, add, remove, enable, disable, test, reload"
192
+
193
+
194
+ @mcp.tool()
195
+ async def pronounce_status() -> str:
196
+ """
197
+ Get the status of the pronunciation middleware.
198
+
199
+ Shows whether pronunciation processing is enabled and provides
200
+ statistics about loaded rules.
201
+
202
+ Returns:
203
+ Status information as a formatted string
204
+ """
205
+ enabled = is_enabled()
206
+ manager = get_manager()
207
+
208
+ # Get rule counts
209
+ all_rules = manager.list_rules(include_private=True)
210
+ public_rules = manager.list_rules(include_private=False)
211
+
212
+ tts_all = len([r for r in all_rules if r['direction'] == 'tts'])
213
+ tts_public = len([r for r in public_rules if r['direction'] == 'tts'])
214
+ tts_enabled = len([r for r in all_rules if r['direction'] == 'tts' and r['enabled']])
215
+
216
+ stt_all = len([r for r in all_rules if r['direction'] == 'stt'])
217
+ stt_public = len([r for r in public_rules if r['direction'] == 'stt'])
218
+ stt_enabled = len([r for r in all_rules if r['direction'] == 'stt' and r['enabled']])
219
+
220
+ status = f"Pronunciation Middleware Status:\n"
221
+ status += f"{'='*40}\n"
222
+ status += f"Enabled: {'✓ Yes' if enabled else '✗ No'}\n\n"
223
+
224
+ status += f"TTS Rules:\n"
225
+ status += f" Total: {tts_all} ({tts_public} public, {tts_all - tts_public} private)\n"
226
+ status += f" Enabled: {tts_enabled}\n\n"
227
+
228
+ status += f"STT Rules:\n"
229
+ status += f" Total: {stt_all} ({stt_public} public, {stt_all - stt_public} private)\n"
230
+ status += f" Enabled: {stt_enabled}\n\n"
231
+
232
+ status += f"Configuration:\n"
233
+ import os
234
+ log_enabled = os.environ.get('VOICEMODE_PRONUNCIATION_LOG_SUBSTITUTIONS', '').lower() == 'true'
235
+ private_mode = os.environ.get('VOICEMODE_PRONUNCIATION_PRIVATE_MODE', '').lower() == 'true'
236
+
237
+ status += f" Logging: {'✓ Enabled' if log_enabled else '✗ Disabled'}\n"
238
+ status += f" Private Mode: {'✓ All rules private' if private_mode else '✗ Normal'}\n"
239
+
240
+ # Show config file paths
241
+ status += f"\nConfiguration Files:\n"
242
+ for path in manager.config_paths:
243
+ status += f" - {path}\n"
244
+
245
+ return status
@@ -0,0 +1,14 @@
1
+ """Audio transcription with word-level timestamps."""
2
+
3
+ from .types import TranscriptionBackend, OutputFormat, TranscriptionResult, WordData, SegmentData
4
+ from .core import transcribe_audio, transcribe_audio_sync
5
+
6
+ __all__ = [
7
+ 'transcribe_audio',
8
+ 'transcribe_audio_sync',
9
+ 'TranscriptionBackend',
10
+ 'OutputFormat',
11
+ 'TranscriptionResult',
12
+ 'WordData',
13
+ 'SegmentData',
14
+ ]