voice-mode 2.22.3__tar.gz → 2.24.0__tar.gz

This diff compares the contents of two publicly released versions of this package as they appear in the public registry they were published to. It is provided for informational purposes only.
Files changed (108)
  1. {voice_mode-2.22.3 → voice_mode-2.24.0}/CHANGELOG.md +29 -1
  2. {voice_mode-2.22.3 → voice_mode-2.24.0}/PKG-INFO +1 -1
  3. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/__version__.py +1 -1
  4. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/config.py +4 -0
  5. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/converse.py +161 -59
  6. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/service.py +11 -5
  7. voice_mode-2.24.0/voice_mode/utils/services/common.py +80 -0
  8. voice_mode-2.22.3/voice_mode/utils/services/common.py +0 -22
  9. {voice_mode-2.22.3 → voice_mode-2.24.0}/.gitignore +0 -0
  10. {voice_mode-2.22.3 → voice_mode-2.24.0}/README.md +0 -0
  11. {voice_mode-2.22.3 → voice_mode-2.24.0}/build_hooks.py +0 -0
  12. {voice_mode-2.22.3 → voice_mode-2.24.0}/pyproject.toml +0 -0
  13. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/__init__.py +0 -0
  14. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/__main__.py +0 -0
  15. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/cli.py +0 -0
  16. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/cli_commands/__init__.py +0 -0
  17. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/cli_commands/exchanges.py +0 -0
  18. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/conversation_logger.py +0 -0
  19. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/core.py +0 -0
  20. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/data/versions.json +0 -0
  21. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/exchanges/__init__.py +0 -0
  22. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/exchanges/conversations.py +0 -0
  23. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/exchanges/filters.py +0 -0
  24. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/exchanges/formatters.py +0 -0
  25. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/exchanges/models.py +0 -0
  26. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/exchanges/reader.py +0 -0
  27. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/exchanges/stats.py +0 -0
  28. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/README.md +0 -0
  29. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/app/api/connection-details/route.ts +0 -0
  30. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/app/favicon.ico +0 -0
  31. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/app/globals.css +0 -0
  32. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/app/layout.tsx +0 -0
  33. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/app/page.tsx +0 -0
  34. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/components/CloseIcon.tsx +0 -0
  35. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/components/NoAgentNotification.tsx +0 -0
  36. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/components/TranscriptionView.tsx +0 -0
  37. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/hooks/useCombinedTranscriptions.ts +0 -0
  38. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/hooks/useLocalMicTrack.ts +0 -0
  39. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/next-env.d.ts +0 -0
  40. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/next.config.mjs +0 -0
  41. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/package-lock.json +0 -0
  42. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/package.json +0 -0
  43. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/pnpm-lock.yaml +0 -0
  44. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/postcss.config.mjs +0 -0
  45. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/tailwind.config.ts +0 -0
  46. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/frontend/tsconfig.json +0 -0
  47. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/prompts/README.md +0 -0
  48. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/prompts/__init__.py +0 -0
  49. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/prompts/converse.py +0 -0
  50. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/prompts/release_notes.py +0 -0
  51. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/prompts/services.py +0 -0
  52. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/provider_discovery.py +0 -0
  53. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/providers.py +0 -0
  54. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/resources/__init__.py +0 -0
  55. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/resources/audio_files.py +0 -0
  56. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/resources/changelog.py +0 -0
  57. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/resources/configuration.py +0 -0
  58. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/resources/statistics.py +0 -0
  59. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/resources/version.py +0 -0
  60. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/resources/whisper_models.py +0 -0
  61. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/server.py +0 -0
  62. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/shared.py +0 -0
  63. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/simple_failover.py +0 -0
  64. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/statistics.py +0 -0
  65. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/streaming.py +0 -0
  66. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/templates/launchd/com.voicemode.frontend.plist +0 -0
  67. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/templates/launchd/com.voicemode.kokoro.plist +0 -0
  68. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/templates/launchd/com.voicemode.livekit.plist +0 -0
  69. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/templates/launchd/com.voicemode.whisper.plist +0 -0
  70. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/templates/launchd/start-kokoro-with-health-check.sh +0 -0
  71. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/templates/launchd/start-whisper-with-health-check.sh +0 -0
  72. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/templates/systemd/voicemode-frontend.service +0 -0
  73. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/templates/systemd/voicemode-kokoro.service +0 -0
  74. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/templates/systemd/voicemode-livekit.service +0 -0
  75. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/templates/systemd/voicemode-whisper.service +0 -0
  76. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/__init__.py +0 -0
  77. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/configuration_management.py +0 -0
  78. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/dependencies.py +0 -0
  79. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/devices.py +0 -0
  80. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/diagnostics.py +0 -0
  81. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/providers.py +0 -0
  82. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/services/kokoro/install.py +0 -0
  83. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/services/kokoro/uninstall.py +0 -0
  84. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/services/list_versions.py +0 -0
  85. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/services/livekit/__init__.py +0 -0
  86. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/services/livekit/frontend.py +0 -0
  87. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/services/livekit/install.py +0 -0
  88. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/services/livekit/production_server.py +0 -0
  89. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/services/livekit/uninstall.py +0 -0
  90. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/services/version_info.py +0 -0
  91. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/services/whisper/download_model.py +0 -0
  92. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/services/whisper/install.py +0 -0
  93. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/services/whisper/uninstall.py +0 -0
  94. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/statistics.py +0 -0
  95. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/tools/voice_registry.py +0 -0
  96. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/utils/__init__.py +0 -0
  97. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/utils/audio_diagnostics.py +0 -0
  98. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/utils/event_logger.py +0 -0
  99. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/utils/ffmpeg_check.py +0 -0
  100. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/utils/format_migration.py +0 -0
  101. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/utils/gpu_detection.py +0 -0
  102. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/utils/migration_helpers.py +0 -0
  103. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/utils/services/kokoro_helpers.py +0 -0
  104. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/utils/services/livekit_helpers.py +0 -0
  105. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/utils/services/whisper_helpers.py +0 -0
  106. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/utils/version_helpers.py +0 -0
  107. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/version.py +0 -0
  108. {voice_mode-2.22.3 → voice_mode-2.24.0}/voice_mode/voice_preferences.py +0 -0
@@ -7,10 +7,38 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
- ## [2.22.3] - 2025-08-16
10
+ ## [2.24.0] - 2025-08-16
11
+
12
+ ### Added
13
+ - **VAD debugging mode** - Comprehensive debugging for Voice Activity Detection
14
+ - New `VOICEMODE_VAD_DEBUG` environment variable enables detailed VAD logging
15
+ - Shows real-time speech detection decisions, state transitions, and timing
16
+ - Helps diagnose issues where recording stops before speech or cuts off early
17
+ - Added test script `scripts/test-vad-enhancement.py` for VAD testing
18
+ - Documented in `docs/vad-debugging.md` with common issues and solutions
11
19
 
12
20
  ## [2.23.0] - 2025-08-16
13
21
 
22
+ ### Added
23
+ - **`skip_tts` parameter** - Dynamic control over text-to-speech in converse tool
24
+ - Add optional `skip_tts` parameter to override global `VOICEMODE_SKIP_TTS` setting
25
+ - When `True`: Skip TTS for faster text-only responses
26
+ - When `False`: Always use TTS regardless of environment setting
27
+ - When `None` (default): Follow `VOICEMODE_SKIP_TTS` environment variable
28
+ - Enables LLM to intelligently choose between voice and text-only responses
29
+ - **`VOICEMODE_SKIP_TTS` environment variable** - Global TTS skip configuration
30
+ - Set to `true` for permanent text-only mode (faster responses)
31
+ - Can be overridden per-call with `skip_tts` parameter
32
+ - Useful for rapid development iterations or when voice isn't needed
33
+
34
+ ### Fixed
35
+ - **Service status detection** - Correctly identify SSH-forwarded vs locally running services
36
+ - SSH processes listening on service ports are now recognized as port forwards
37
+ - Status command now shows 🔄 for forwarded services vs ✅ for local services
38
+ - Prevents confusion about where services are actually running
39
+
40
+ ## [2.22.3] - 2025-08-16
41
+
14
42
  ### Fixed
15
43
  - **Service auto-enable error** - Fix 'FunctionTool' object is not callable
16
44
  - Changed whisper and kokoro installers to use `enable_service` function instead of MCP tool
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: voice-mode
3
- Version: 2.22.3
3
+ Version: 2.24.0
4
4
  Summary: VoiceMode - Voice interaction capabilities for AI assistants (formerly voice-mcp)
5
5
  Project-URL: Homepage, https://github.com/mbailey/voicemode
6
6
  Project-URL: Repository, https://github.com/mbailey/voicemode
@@ -1,3 +1,3 @@
1
1
  # This file is automatically updated by 'make release'
2
2
  # Do not edit manually
3
- __version__ = "2.22.3"
3
+ __version__ = "2.24.0"
@@ -162,6 +162,7 @@ MODELS_DIR = Path(os.getenv("VOICEMODE_MODELS_DIR", str(BASE_DIR / "models")))
162
162
  # Debug configuration
163
163
  DEBUG = os.getenv("VOICEMODE_DEBUG", "").lower() in ("true", "1", "yes", "on")
164
164
  TRACE_DEBUG = os.getenv("VOICEMODE_DEBUG", "").lower() == "trace"
165
+ VAD_DEBUG = os.getenv("VOICEMODE_VAD_DEBUG", "").lower() in ("true", "1", "yes", "on")
165
166
  DEBUG_DIR = LOGS_DIR / "debug" # Debug files now go under logs
166
167
 
167
168
  # Master save-all configuration
@@ -175,6 +176,9 @@ SAVE_TRANSCRIPTIONS = SAVE_ALL or DEBUG or os.getenv("VOICEMODE_SAVE_TRANSCRIPTI
175
176
  # Audio feedback configuration
176
177
  AUDIO_FEEDBACK_ENABLED = os.getenv("VOICEMODE_AUDIO_FEEDBACK", "true").lower() in ("true", "1", "yes", "on")
177
178
 
179
+ # Skip TTS configuration (skip text-to-speech for faster responses)
180
+ SKIP_TTS = os.getenv("VOICEMODE_SKIP_TTS", "false").lower() in ("true", "1", "yes", "on")
181
+
178
182
  # Local provider preference configuration
179
183
  PREFER_LOCAL = os.getenv("VOICEMODE_PREFER_LOCAL", "true").lower() in ("true", "1", "yes", "on")
180
184
 
@@ -32,6 +32,7 @@ from voice_mode.config import (
32
32
  CHANNELS,
33
33
  DEBUG,
34
34
  DEBUG_DIR,
35
+ VAD_DEBUG,
35
36
  SAVE_AUDIO,
36
37
  AUDIO_DIR,
37
38
  OPENAI_API_KEY,
@@ -48,6 +49,7 @@ from voice_mode.config import (
48
49
  VAD_AGGRESSIVENESS,
49
50
  SILENCE_THRESHOLD_MS,
50
51
  MIN_RECORDING_DURATION,
52
+ SKIP_TTS,
51
53
  VAD_CHUNK_DURATION_MS,
52
54
  INITIAL_SILENCE_GRACE_PERIOD,
53
55
  DEFAULT_LISTEN_DURATION,
@@ -871,7 +873,7 @@ def record_audio(duration: float) -> np.ndarray:
871
873
  sys.stderr = original_stderr
872
874
 
873
875
 
874
- def record_audio_with_silence_detection(max_duration: float, disable_silence_detection: bool = False, min_duration: float = 0.0, vad_aggressiveness: Optional[int] = None) -> np.ndarray:
876
+ def record_audio_with_silence_detection(max_duration: float, disable_silence_detection: bool = False, min_duration: float = 0.0, vad_aggressiveness: Optional[int] = None) -> Tuple[np.ndarray, bool]:
875
877
  """Record audio from microphone with automatic silence detection.
876
878
 
877
879
  Uses WebRTC VAD to detect when the user stops speaking and automatically
@@ -884,21 +886,25 @@ def record_audio_with_silence_detection(max_duration: float, disable_silence_det
884
886
  vad_aggressiveness: VAD aggressiveness level (0-3). If None, uses VAD_AGGRESSIVENESS from config
885
887
 
886
888
  Returns:
887
- Numpy array of recorded audio samples
889
+ Tuple of (audio_data, speech_detected):
890
+ - audio_data: Numpy array of recorded audio samples
891
+ - speech_detected: Boolean indicating if speech was detected during recording
888
892
  """
889
893
 
890
894
  logger.info(f"record_audio_with_silence_detection called - VAD_AVAILABLE={VAD_AVAILABLE}, DISABLE_SILENCE_DETECTION={DISABLE_SILENCE_DETECTION}, min_duration={min_duration}")
891
895
 
892
896
  if not VAD_AVAILABLE:
893
897
  logger.warning("webrtcvad not available, falling back to fixed duration recording")
894
- return record_audio(max_duration)
898
+ # For fallback, assume speech is present since we can't detect
899
+ return (record_audio(max_duration), True)
895
900
 
896
901
  if DISABLE_SILENCE_DETECTION or disable_silence_detection:
897
902
  if disable_silence_detection:
898
903
  logger.info("Silence detection disabled for this interaction by request")
899
904
  else:
900
905
  logger.info("Silence detection disabled globally via VOICEMODE_DISABLE_SILENCE_DETECTION")
901
- return record_audio(max_duration)
906
+ # For fallback, assume speech is present since we can't detect
907
+ return (record_audio(max_duration), True)
902
908
 
903
909
  logger.info(f"🎤 Recording with silence detection (max {max_duration}s)...")
904
910
 
@@ -939,6 +945,16 @@ def record_audio_with_silence_detection(max_duration: float, disable_silence_det
939
945
  f"Min duration: {MIN_RECORDING_DURATION}s, "
940
946
  f"Initial grace period: {INITIAL_SILENCE_GRACE_PERIOD}s")
941
947
 
948
+ if VAD_DEBUG:
949
+ logger.info(f"[VAD_DEBUG] Starting VAD recording with config:")
950
+ logger.info(f"[VAD_DEBUG] max_duration: {max_duration}s")
951
+ logger.info(f"[VAD_DEBUG] min_duration: {min_duration}s")
952
+ logger.info(f"[VAD_DEBUG] effective_min_duration: {max(MIN_RECORDING_DURATION, min_duration)}s")
953
+ logger.info(f"[VAD_DEBUG] VAD aggressiveness: {effective_vad_aggressiveness}")
954
+ logger.info(f"[VAD_DEBUG] Silence threshold: {SILENCE_THRESHOLD_MS}ms")
955
+ logger.info(f"[VAD_DEBUG] Sample rate: {SAMPLE_RATE}Hz (VAD using {vad_sample_rate}Hz)")
956
+ logger.info(f"[VAD_DEBUG] Chunk duration: {VAD_CHUNK_DURATION_MS}ms")
957
+
942
958
  def audio_callback(indata, frames, time, status):
943
959
  """Callback for continuous audio stream"""
944
960
  if status:
@@ -978,35 +994,53 @@ def record_audio_with_silence_detection(max_duration: float, disable_silence_det
978
994
  # Check if chunk contains speech
979
995
  try:
980
996
  is_speech = vad.is_speech(chunk_bytes, vad_sample_rate)
997
+ if VAD_DEBUG:
998
+ # Log VAD decision every 500ms for less spam
999
+ if int(recording_duration * 1000) % 500 == 0:
1000
+ rms = np.sqrt(np.mean(chunk.astype(float)**2))
1001
+ logger.info(f"[VAD_DEBUG] t={recording_duration:.1f}s: speech={is_speech}, RMS={rms:.0f}, state={'WAITING' if not speech_detected else 'ACTIVE'}")
981
1002
  except Exception as vad_e:
982
1003
  logger.warning(f"VAD error: {vad_e}, treating as speech")
983
1004
  is_speech = True
984
1005
 
985
- if is_speech:
986
- if not speech_detected:
987
- logger.debug("Speech detected, recording...")
988
- speech_detected = True
989
- silence_duration_ms = 0
1006
+ # State machine for speech detection
1007
+ if not speech_detected:
1008
+ # WAITING_FOR_SPEECH state
1009
+ if is_speech:
1010
+ logger.info("🎤 Speech detected, starting active recording")
1011
+ if VAD_DEBUG:
1012
+ logger.info(f"[VAD_DEBUG] STATE CHANGE: WAITING_FOR_SPEECH -> SPEECH_ACTIVE at t={recording_duration:.1f}s")
1013
+ speech_detected = True
1014
+ silence_duration_ms = 0
1015
+ # No timeout in this state - just keep waiting
1016
+ # The only exit is speech detection or max_duration
990
1017
  else:
991
- silence_duration_ms += VAD_CHUNK_DURATION_MS
992
- if speech_detected and silence_duration_ms % 200 == 0: # Log every 200ms
993
- logger.debug(f"Silence: {silence_duration_ms}ms")
1018
+ # We have detected speech at some point
1019
+ if is_speech:
1020
+ # SPEECH_ACTIVE state - reset silence counter
1021
+ silence_duration_ms = 0
1022
+ else:
1023
+ # SILENCE_AFTER_SPEECH state - accumulate silence
1024
+ silence_duration_ms += VAD_CHUNK_DURATION_MS
1025
+ if VAD_DEBUG and silence_duration_ms % 100 == 0: # More frequent logging in debug mode
1026
+ logger.info(f"[VAD_DEBUG] Accumulating silence: {silence_duration_ms}/{SILENCE_THRESHOLD_MS}ms, t={recording_duration:.1f}s")
1027
+ elif silence_duration_ms % 200 == 0: # Log every 200ms
1028
+ logger.debug(f"Silence: {silence_duration_ms}ms")
1029
+
1030
+ # Check if we should stop due to silence threshold
1031
+ # Use the larger of MIN_RECORDING_DURATION (global) or min_duration (parameter)
1032
+ effective_min_duration = max(MIN_RECORDING_DURATION, min_duration)
1033
+ if recording_duration >= effective_min_duration and silence_duration_ms >= SILENCE_THRESHOLD_MS:
1034
+ logger.info(f"✓ Silence threshold reached after {recording_duration:.1f}s of recording")
1035
+ if VAD_DEBUG:
1036
+ logger.info(f"[VAD_DEBUG] STOP: silence_duration={silence_duration_ms}ms >= threshold={SILENCE_THRESHOLD_MS}ms")
1037
+ logger.info(f"[VAD_DEBUG] STOP: recording_duration={recording_duration:.1f}s >= min_duration={effective_min_duration}s")
1038
+ stop_recording = True
1039
+ elif VAD_DEBUG and recording_duration < effective_min_duration:
1040
+ if int(recording_duration * 1000) % 500 == 0: # Log every 500ms
1041
+ logger.info(f"[VAD_DEBUG] Min duration not met: {recording_duration:.1f}s < {effective_min_duration}s")
994
1042
 
995
1043
  recording_duration += chunk_duration_s
996
-
997
- # Check stop conditions
998
- # Use the larger of MIN_RECORDING_DURATION (global) or min_duration (parameter)
999
- effective_min_duration = max(MIN_RECORDING_DURATION, min_duration)
1000
- if speech_detected and recording_duration >= effective_min_duration:
1001
- if silence_duration_ms >= SILENCE_THRESHOLD_MS:
1002
- logger.info(f"✓ Silence detected after {recording_duration:.1f}s (min: {effective_min_duration:.1f}s), stopping recording")
1003
- stop_recording = True
1004
-
1005
- # Also stop if we haven't detected any speech after a grace period
1006
- # Give user time to start speaking
1007
- if not speech_detected and recording_duration >= INITIAL_SILENCE_GRACE_PERIOD:
1008
- logger.info(f"No speech detected after {INITIAL_SILENCE_GRACE_PERIOD}s grace period, stopping recording")
1009
- stop_recording = True
1010
1044
 
1011
1045
  except queue.Empty:
1012
1046
  # No audio data available, continue waiting
@@ -1018,17 +1052,26 @@ def record_audio_with_silence_detection(max_duration: float, disable_silence_det
1018
1052
  # Concatenate all chunks
1019
1053
  if chunks:
1020
1054
  full_recording = np.concatenate(chunks)
1021
- logger.info(f"✓ Recorded {len(full_recording)} samples ({recording_duration:.1f}s)")
1055
+
1056
+ if not speech_detected:
1057
+ logger.info(f"✓ Recording completed ({recording_duration:.1f}s) - No speech detected")
1058
+ if VAD_DEBUG:
1059
+ logger.info(f"[VAD_DEBUG] FINAL STATE: No speech was ever detected during recording")
1060
+ else:
1061
+ logger.info(f"✓ Recorded {len(full_recording)} samples ({recording_duration:.1f}s) with speech")
1062
+ if VAD_DEBUG:
1063
+ logger.info(f"[VAD_DEBUG] FINAL STATE: Speech was detected, recording complete")
1022
1064
 
1023
1065
  if DEBUG:
1024
1066
  # Calculate RMS for debug
1025
1067
  rms = np.sqrt(np.mean(full_recording.astype(float) ** 2))
1026
1068
  logger.debug(f"Recording stats - RMS: {rms:.2f}, Speech detected: {speech_detected}")
1027
1069
 
1028
- return full_recording
1070
+ # Return tuple: (audio_data, speech_detected)
1071
+ return (full_recording, speech_detected)
1029
1072
  else:
1030
1073
  logger.warning("No audio chunks recorded")
1031
- return np.array([])
1074
+ return (np.array([]), False)
1032
1075
 
1033
1076
  except Exception as e:
1034
1077
  logger.error(f"Recording with VAD failed: {e}")
@@ -1041,7 +1084,8 @@ def record_audio_with_silence_detection(max_duration: float, disable_silence_det
1041
1084
  logger.error(f"\n{help_message}")
1042
1085
 
1043
1086
  logger.info("Falling back to fixed duration recording")
1044
- return record_audio(max_duration)
1087
+ # For fallback, assume speech is present since we can't detect
1088
+ return (record_audio(max_duration), True)
1045
1089
 
1046
1090
  finally:
1047
1091
  # Restore stdio
@@ -1055,7 +1099,8 @@ def record_audio_with_silence_detection(max_duration: float, disable_silence_det
1055
1099
  except Exception as e:
1056
1100
  logger.error(f"VAD initialization failed: {e}")
1057
1101
  logger.info("Falling back to fixed duration recording")
1058
- return record_audio(max_duration)
1102
+ # For fallback, assume speech is present since we can't detect
1103
+ return (record_audio(max_duration), True)
1059
1104
 
1060
1105
 
1061
1106
  async def check_livekit_available() -> bool:
@@ -1248,7 +1293,8 @@ async def converse(
1248
1293
  audio_format: Optional[str] = None,
1249
1294
  disable_silence_detection: Union[bool, str] = False,
1250
1295
  speed: Optional[float] = None,
1251
- vad_aggressiveness: Optional[int] = None
1296
+ vad_aggressiveness: Optional[int] = None,
1297
+ skip_tts: Optional[Union[bool, str]] = None
1252
1298
  ) -> str:
1253
1299
  """Have a voice conversation - speak a message and optionally listen for response.
1254
1300
 
@@ -1320,6 +1366,11 @@ async def converse(
1320
1366
 
1321
1367
  Use lower values (0-1) in quiet environments to catch all speech
1322
1368
  Use higher values (2-3) in noisy environments to reduce false triggers
1369
+ skip_tts: Skip text-to-speech and only show text (default: None uses VOICEMODE_SKIP_TTS env var)
1370
+ When True: Skip TTS for faster response, text-only output
1371
+ When False: Always use TTS regardless of environment setting
1372
+ When None: Follow VOICEMODE_SKIP_TTS environment variable
1373
+ Useful for rapid development iterations or when voice isn't needed
1323
1374
  If wait_for_response is False: Confirmation that message was spoken
1324
1375
  If wait_for_response is True: The voice response received (or error/timeout message)
1325
1376
 
@@ -1360,6 +1411,12 @@ async def converse(
1360
1411
 
1361
1412
  Remember: Lower values (0-1) = more permissive, may detect non-speech as speech
1362
1413
  Higher values (2-3) = more strict, may miss soft speech or whispers
1414
+
1415
+ Skip TTS Examples:
1416
+ - Fast iteration mode: converse("Processing your request", skip_tts=True) # Text only, no voice
1417
+ - Important announcement: converse("Warning: System will restart", skip_tts=False) # Always use voice
1418
+ - Quick confirmation: converse("Done!", skip_tts=True, wait_for_response=False) # Fast text-only
1419
+ - Follow user preference: converse("Hello") # Uses VOICEMODE_SKIP_TTS setting
1363
1420
  """
1364
1421
  # Convert string booleans to actual booleans
1365
1422
  if isinstance(wait_for_response, str):
@@ -1368,6 +1425,16 @@ async def converse(
1368
1425
  disable_silence_detection = disable_silence_detection.lower() in ('true', '1', 'yes', 'on')
1369
1426
  if isinstance(audio_feedback, str):
1370
1427
  audio_feedback = audio_feedback.lower() in ('true', '1', 'yes', 'on')
1428
+ if skip_tts is not None and isinstance(skip_tts, str):
1429
+ skip_tts = skip_tts.lower() in ('true', '1', 'yes', 'on')
1430
+
1431
+ # Determine whether to skip TTS
1432
+ if skip_tts is not None:
1433
+ # Parameter explicitly set, use it
1434
+ should_skip_tts = skip_tts
1435
+ else:
1436
+ # Use global setting
1437
+ should_skip_tts = SKIP_TTS
1371
1438
 
1372
1439
  # Convert string speed to float
1373
1440
  if speed is not None and isinstance(speed, str):
@@ -1457,15 +1524,26 @@ async def converse(
1457
1524
  if not wait_for_response:
1458
1525
  try:
1459
1526
  async with audio_operation_lock:
1460
- success, tts_metrics, tts_config = await text_to_speech_with_failover(
1461
- message=message,
1462
- voice=voice,
1463
- model=tts_model,
1464
- instructions=tts_instructions,
1465
- audio_format=audio_format,
1466
- initial_provider=tts_provider,
1467
- speed=speed
1468
- )
1527
+ if should_skip_tts:
1528
+ # Skip TTS entirely
1529
+ success = True
1530
+ tts_metrics = {
1531
+ 'ttfa': 0,
1532
+ 'generation': 0,
1533
+ 'playback': 0,
1534
+ 'total': 0
1535
+ }
1536
+ tts_config = {'provider': 'no-op', 'voice': 'none'}
1537
+ else:
1538
+ success, tts_metrics, tts_config = await text_to_speech_with_failover(
1539
+ message=message,
1540
+ voice=voice,
1541
+ model=tts_model,
1542
+ instructions=tts_instructions,
1543
+ audio_format=audio_format,
1544
+ initial_provider=tts_provider,
1545
+ speed=speed
1546
+ )
1469
1547
 
1470
1548
  # Include timing info if available
1471
1549
  timing_info = ""
@@ -1589,15 +1667,26 @@ async def converse(
1589
1667
  async with audio_operation_lock:
1590
1668
  # Speak the message
1591
1669
  tts_start = time.perf_counter()
1592
- tts_success, tts_metrics, tts_config = await text_to_speech_with_failover(
1593
- message=message,
1594
- voice=voice,
1595
- model=tts_model,
1596
- instructions=tts_instructions,
1597
- audio_format=audio_format,
1598
- initial_provider=tts_provider,
1599
- speed=speed
1600
- )
1670
+ if should_skip_tts:
1671
+ # Skip TTS entirely for faster response
1672
+ tts_success = True
1673
+ tts_metrics = {
1674
+ 'ttfa': 0,
1675
+ 'generation': 0,
1676
+ 'playback': 0,
1677
+ 'total': 0
1678
+ }
1679
+ tts_config = {'provider': 'no-op', 'voice': 'none'}
1680
+ else:
1681
+ tts_success, tts_metrics, tts_config = await text_to_speech_with_failover(
1682
+ message=message,
1683
+ voice=voice,
1684
+ model=tts_model,
1685
+ instructions=tts_instructions,
1686
+ audio_format=audio_format,
1687
+ initial_provider=tts_provider,
1688
+ speed=speed
1689
+ )
1601
1690
 
1602
1691
  # Add TTS sub-metrics
1603
1692
  if tts_metrics:
@@ -1668,7 +1757,7 @@ async def converse(
1668
1757
 
1669
1758
  record_start = time.perf_counter()
1670
1759
  logger.debug(f"About to call record_audio_with_silence_detection with duration={listen_duration}, disable_silence_detection={disable_silence_detection}, min_duration={min_listen_duration}, vad_aggressiveness={vad_aggressiveness}")
1671
- audio_data = await asyncio.get_event_loop().run_in_executor(
1760
+ audio_data, speech_detected = await asyncio.get_event_loop().run_in_executor(
1672
1761
  None, record_audio_with_silence_detection, listen_duration, disable_silence_detection, min_listen_duration, vad_aggressiveness
1673
1762
  )
1674
1763
  timings['record'] = time.perf_counter() - record_start
@@ -1691,14 +1780,27 @@ async def converse(
1691
1780
  result = "Error: Could not record audio"
1692
1781
  return result
1693
1782
 
1694
- # Convert to text
1695
- # Log STT start
1696
- if event_logger:
1697
- event_logger.log_event(event_logger.STT_START)
1698
-
1699
- stt_start = time.perf_counter()
1700
- response_text = await speech_to_text(audio_data, SAVE_AUDIO, AUDIO_DIR if SAVE_AUDIO else None, transport)
1701
- timings['stt'] = time.perf_counter() - stt_start
1783
+ # Check if no speech was detected
1784
+ if not speech_detected:
1785
+ logger.info("No speech detected during recording - skipping STT processing")
1786
+ response_text = None
1787
+ timings['stt'] = 0.0
1788
+
1789
+ # Still save the audio if configured
1790
+ if SAVE_AUDIO and AUDIO_DIR:
1791
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
1792
+ audio_path = os.path.join(AUDIO_DIR, f"no_speech_{timestamp}.wav")
1793
+ write(audio_path, SAMPLE_RATE, audio_data)
1794
+ logger.debug(f"Saved no-speech audio to: {audio_path}")
1795
+ else:
1796
+ # Convert to text
1797
+ # Log STT start
1798
+ if event_logger:
1799
+ event_logger.log_event(event_logger.STT_START)
1800
+
1801
+ stt_start = time.perf_counter()
1802
+ response_text = await speech_to_text(audio_data, SAVE_AUDIO, AUDIO_DIR if SAVE_AUDIO else None, transport)
1803
+ timings['stt'] = time.perf_counter() - stt_start
1702
1804
 
1703
1805
  # Log STT complete
1704
1806
  if event_logger:
@@ -14,7 +14,7 @@ import psutil
14
14
 
15
15
  from voice_mode.server import mcp
16
16
  from voice_mode.config import WHISPER_PORT, KOKORO_PORT, LIVEKIT_PORT, SERVICE_AUTO_ENABLE
17
- from voice_mode.utils.services.common import find_process_by_port
17
+ from voice_mode.utils.services.common import find_process_by_port, check_service_status
18
18
  from voice_mode.utils.services.whisper_helpers import find_whisper_server, find_whisper_model
19
19
  from voice_mode.utils.services.kokoro_helpers import find_kokoro_fastapi, has_gpu_support
20
20
 
@@ -195,10 +195,16 @@ async def status_service(service_name: str) -> str:
195
195
  port = LIVEKIT_PORT
196
196
  else: # frontend
197
197
  port = 3000
198
- proc = find_process_by_port(port)
199
198
 
200
- if not proc:
201
- return f"{service_name.capitalize()} is not running on port {port}"
199
+ status, proc = check_service_status(port)
200
+
201
+ if status == "not_available":
202
+ return f"❌ {service_name.capitalize()} is not available"
203
+ elif status == "forwarded":
204
+ return f"""🔄 {service_name.capitalize()} is available via port forwarding
205
+ Port: {port} (forwarded)
206
+ Local process: Not running
207
+ Remote: Accessible"""
202
208
 
203
209
  try:
204
210
  with proc.oneshot():
@@ -269,7 +275,7 @@ async def status_service(service_name: str) -> str:
269
275
  if extra_info_parts:
270
276
  extra_info = "\n " + "\n ".join(extra_info_parts)
271
277
 
272
- return f"""✅ {service_name.capitalize()} is running
278
+ return f"""✅ {service_name.capitalize()} is running locally
273
279
  PID: {proc.pid}
274
280
  Port: {port}
275
281
  CPU: {cpu_percent:.1f}%
@@ -0,0 +1,80 @@
1
+ """Common utilities for service management tools."""
2
+
3
+ import psutil
4
+ import socket
5
+ from typing import Optional, Tuple
6
+ import logging
7
+
8
+ logger = logging.getLogger("voice-mode")
9
+
10
+
11
def find_process_by_port(port: int) -> Optional[psutil.Process]:
    """Find a local, non-SSH process listening on the specified port.

    Args:
        port: Port number to look for among the listening sockets of each process.

    Returns:
        The first process that owns a socket in the LISTEN state on ``port``.
        None if no such process is found (for example when the port is only
        reachable through an SSH forward, since ssh/sshd processes are skipped),
        or if the scan itself fails; in that case the error is logged.
    """
    try:
        for proc in psutil.process_iter(['pid', 'name']):
            try:
                # Skip processes that have already exited since the snapshot was taken
                if not proc.is_running():
                    continue

                # Skip SSH processes - these are port forwards, not actual services
                if proc.name().lower() in ('ssh', 'sshd'):
                    continue

                # psutil 6.0 deprecated Process.connections() in favour of
                # Process.net_connections(); prefer the new name and fall back to
                # the old one so older psutil releases keep working.
                list_connections = getattr(proc, 'net_connections', None) or proc.connections

                for conn in list_connections():
                    if conn.laddr.port == port and conn.status == 'LISTEN':
                        # Verify this is a real local process
                        try:
                            # Touch basic process info to ensure it still exists
                            _ = proc.pid
                            _ = proc.create_time()
                            return proc
                        except (psutil.NoSuchProcess, psutil.AccessDenied):
                            # Process vanished or is not accessible; keep looking
                            continue
            except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
                # Another user's process, or one that died mid-inspection
                continue
    except Exception as e:
        # Best effort: a failed scan is reported as "no process found"
        logger.error(f"Error finding process by port: {e}")
    return None
44
+
45
+
46
def is_port_accessible(port: int, host: str = "127.0.0.1", timeout: float = 1.0) -> bool:
    """Check if a port is accessible (a TCP connection to it can be opened).

    This will return True for both locally running services and SSH-forwarded
    ports, because it only tests that something accepts the connection.

    Args:
        port: TCP port to probe.
        host: Address or hostname to connect to. IPv4 literals, IPv6 literals and
            hostnames are accepted; each resolved address is tried in turn.
        timeout: Seconds to wait for each connection attempt.

    Returns:
        True if a connection was established, False otherwise.
    """
    try:
        # create_connection resolves the host and chooses the address family
        # itself, so the probe is not limited to IPv4 like a fixed AF_INET socket.
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except socket.gaierror as e:
        # Name resolution failed: this is a caller/configuration problem, so log it
        logger.error(f"Error checking port accessibility: {e}")
        return False
    except OSError:
        # Refused, unreachable or timed out: the normal "not accessible" outcome
        return False
    except Exception as e:
        # Anything else (e.g. an out-of-range port) is unexpected; log and report False
        logger.error(f"Error checking port accessibility: {e}")
        return False
59
+
60
+
61
def check_service_status(port: int) -> Tuple[str, Optional[psutil.Process]]:
    """Classify how the service on ``port`` can be reached.

    Returns:
        A ``(status, process)`` pair:
        - ("local", process) when a local process is listening on the port
        - ("forwarded", None) when the port accepts connections but no local
          process owns it
        - ("not_available", None) when the port cannot be reached at all
    """
    local_proc = find_process_by_port(port)
    if local_proc:
        # A local listener wins; no need to probe the socket
        return ("local", local_proc)

    # Nothing local: a successful connect means the port is served from elsewhere
    status = "forwarded" if is_port_accessible(port) else "not_available"
    return (status, None)
@@ -1,22 +0,0 @@
1
- """Common utilities for service management tools."""
2
-
3
- import psutil
4
- from typing import Optional
5
- import logging
6
-
7
- logger = logging.getLogger("voice-mode")
8
-
9
-
10
def find_process_by_port(port: int) -> Optional[psutil.Process]:
    """Return the first process with a socket listening on ``port``, or None.

    Processes that disappear or deny access while being inspected are skipped.
    Any other failure is logged and reported as None.
    """
    try:
        for candidate in psutil.process_iter(['pid', 'name']):
            try:
                sockets = candidate.connections()
                if any(s.laddr.port == port and s.status == 'LISTEN' for s in sockets):
                    return candidate
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue
    except Exception as exc:
        logger.error(f"Error finding process by port: {exc}")
    return None
File without changes
File without changes
File without changes
File without changes