voice-mode 2.23.0__tar.gz → 2.24.0__tar.gz

This diff shows the content changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
Files changed (107)
  1. {voice_mode-2.23.0 → voice_mode-2.24.0}/CHANGELOG.md +10 -2
  2. {voice_mode-2.23.0 → voice_mode-2.24.0}/PKG-INFO +1 -1
  3. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/__version__.py +1 -1
  4. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/config.py +1 -0
  5. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/converse.py +97 -40
  6. {voice_mode-2.23.0 → voice_mode-2.24.0}/.gitignore +0 -0
  7. {voice_mode-2.23.0 → voice_mode-2.24.0}/README.md +0 -0
  8. {voice_mode-2.23.0 → voice_mode-2.24.0}/build_hooks.py +0 -0
  9. {voice_mode-2.23.0 → voice_mode-2.24.0}/pyproject.toml +0 -0
  10. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/__init__.py +0 -0
  11. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/__main__.py +0 -0
  12. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/cli.py +0 -0
  13. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/cli_commands/__init__.py +0 -0
  14. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/cli_commands/exchanges.py +0 -0
  15. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/conversation_logger.py +0 -0
  16. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/core.py +0 -0
  17. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/data/versions.json +0 -0
  18. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/exchanges/__init__.py +0 -0
  19. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/exchanges/conversations.py +0 -0
  20. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/exchanges/filters.py +0 -0
  21. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/exchanges/formatters.py +0 -0
  22. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/exchanges/models.py +0 -0
  23. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/exchanges/reader.py +0 -0
  24. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/exchanges/stats.py +0 -0
  25. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/README.md +0 -0
  26. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/app/api/connection-details/route.ts +0 -0
  27. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/app/favicon.ico +0 -0
  28. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/app/globals.css +0 -0
  29. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/app/layout.tsx +0 -0
  30. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/app/page.tsx +0 -0
  31. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/components/CloseIcon.tsx +0 -0
  32. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/components/NoAgentNotification.tsx +0 -0
  33. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/components/TranscriptionView.tsx +0 -0
  34. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/hooks/useCombinedTranscriptions.ts +0 -0
  35. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/hooks/useLocalMicTrack.ts +0 -0
  36. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/next-env.d.ts +0 -0
  37. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/next.config.mjs +0 -0
  38. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/package-lock.json +0 -0
  39. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/package.json +0 -0
  40. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/pnpm-lock.yaml +0 -0
  41. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/postcss.config.mjs +0 -0
  42. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/tailwind.config.ts +0 -0
  43. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/frontend/tsconfig.json +0 -0
  44. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/prompts/README.md +0 -0
  45. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/prompts/__init__.py +0 -0
  46. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/prompts/converse.py +0 -0
  47. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/prompts/release_notes.py +0 -0
  48. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/prompts/services.py +0 -0
  49. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/provider_discovery.py +0 -0
  50. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/providers.py +0 -0
  51. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/resources/__init__.py +0 -0
  52. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/resources/audio_files.py +0 -0
  53. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/resources/changelog.py +0 -0
  54. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/resources/configuration.py +0 -0
  55. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/resources/statistics.py +0 -0
  56. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/resources/version.py +0 -0
  57. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/resources/whisper_models.py +0 -0
  58. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/server.py +0 -0
  59. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/shared.py +0 -0
  60. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/simple_failover.py +0 -0
  61. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/statistics.py +0 -0
  62. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/streaming.py +0 -0
  63. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/templates/launchd/com.voicemode.frontend.plist +0 -0
  64. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/templates/launchd/com.voicemode.kokoro.plist +0 -0
  65. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/templates/launchd/com.voicemode.livekit.plist +0 -0
  66. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/templates/launchd/com.voicemode.whisper.plist +0 -0
  67. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/templates/launchd/start-kokoro-with-health-check.sh +0 -0
  68. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/templates/launchd/start-whisper-with-health-check.sh +0 -0
  69. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/templates/systemd/voicemode-frontend.service +0 -0
  70. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/templates/systemd/voicemode-kokoro.service +0 -0
  71. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/templates/systemd/voicemode-livekit.service +0 -0
  72. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/templates/systemd/voicemode-whisper.service +0 -0
  73. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/__init__.py +0 -0
  74. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/configuration_management.py +0 -0
  75. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/dependencies.py +0 -0
  76. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/devices.py +0 -0
  77. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/diagnostics.py +0 -0
  78. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/providers.py +0 -0
  79. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/service.py +0 -0
  80. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/services/kokoro/install.py +0 -0
  81. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/services/kokoro/uninstall.py +0 -0
  82. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/services/list_versions.py +0 -0
  83. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/services/livekit/__init__.py +0 -0
  84. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/services/livekit/frontend.py +0 -0
  85. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/services/livekit/install.py +0 -0
  86. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/services/livekit/production_server.py +0 -0
  87. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/services/livekit/uninstall.py +0 -0
  88. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/services/version_info.py +0 -0
  89. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/services/whisper/download_model.py +0 -0
  90. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/services/whisper/install.py +0 -0
  91. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/services/whisper/uninstall.py +0 -0
  92. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/statistics.py +0 -0
  93. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/tools/voice_registry.py +0 -0
  94. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/utils/__init__.py +0 -0
  95. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/utils/audio_diagnostics.py +0 -0
  96. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/utils/event_logger.py +0 -0
  97. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/utils/ffmpeg_check.py +0 -0
  98. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/utils/format_migration.py +0 -0
  99. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/utils/gpu_detection.py +0 -0
  100. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/utils/migration_helpers.py +0 -0
  101. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/utils/services/common.py +0 -0
  102. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/utils/services/kokoro_helpers.py +0 -0
  103. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/utils/services/livekit_helpers.py +0 -0
  104. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/utils/services/whisper_helpers.py +0 -0
  105. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/utils/version_helpers.py +0 -0
  106. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/version.py +0 -0
  107. {voice_mode-2.23.0 → voice_mode-2.24.0}/voice_mode/voice_preferences.py +0 -0
@@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [2.24.0] - 2025-08-16
11
+
12
+ ### Added
13
+ - **VAD debugging mode** - Comprehensive debugging for Voice Activity Detection
14
+ - New `VOICEMODE_VAD_DEBUG` environment variable enables detailed VAD logging
15
+ - Shows real-time speech detection decisions, state transitions, and timing
16
+ - Helps diagnose issues where recording stops before speech or cuts off early
17
+ - Added test script `scripts/test-vad-enhancement.py` for VAD testing
18
+ - Documented in `docs/vad-debugging.md` with common issues and solutions
19
+
10
20
  ## [2.23.0] - 2025-08-16
11
21
 
12
22
  ### Added
@@ -29,8 +39,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
29
39
 
30
40
  ## [2.22.3] - 2025-08-16
31
41
 
32
- ## [2.23.0] - 2025-08-16
33
-
34
42
  ### Fixed
35
43
  - **Service auto-enable error** - Fix 'FunctionTool' object is not callable
36
44
  - Changed whisper and kokoro installers to use `enable_service` function instead of MCP tool
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: voice-mode
3
- Version: 2.23.0
3
+ Version: 2.24.0
4
4
  Summary: VoiceMode - Voice interaction capabilities for AI assistants (formerly voice-mcp)
5
5
  Project-URL: Homepage, https://github.com/mbailey/voicemode
6
6
  Project-URL: Repository, https://github.com/mbailey/voicemode
@@ -1,3 +1,3 @@
1
1
  # This file is automatically updated by 'make release'
2
2
  # Do not edit manually
3
- __version__ = "2.23.0"
3
+ __version__ = "2.24.0"
@@ -162,6 +162,7 @@ MODELS_DIR = Path(os.getenv("VOICEMODE_MODELS_DIR", str(BASE_DIR / "models")))
162
162
  # Debug configuration
163
163
  DEBUG = os.getenv("VOICEMODE_DEBUG", "").lower() in ("true", "1", "yes", "on")
164
164
  TRACE_DEBUG = os.getenv("VOICEMODE_DEBUG", "").lower() == "trace"
165
+ VAD_DEBUG = os.getenv("VOICEMODE_VAD_DEBUG", "").lower() in ("true", "1", "yes", "on")
165
166
  DEBUG_DIR = LOGS_DIR / "debug" # Debug files now go under logs
166
167
 
167
168
  # Master save-all configuration
@@ -32,6 +32,7 @@ from voice_mode.config import (
32
32
  CHANNELS,
33
33
  DEBUG,
34
34
  DEBUG_DIR,
35
+ VAD_DEBUG,
35
36
  SAVE_AUDIO,
36
37
  AUDIO_DIR,
37
38
  OPENAI_API_KEY,
@@ -872,7 +873,7 @@ def record_audio(duration: float) -> np.ndarray:
872
873
  sys.stderr = original_stderr
873
874
 
874
875
 
875
- def record_audio_with_silence_detection(max_duration: float, disable_silence_detection: bool = False, min_duration: float = 0.0, vad_aggressiveness: Optional[int] = None) -> np.ndarray:
876
+ def record_audio_with_silence_detection(max_duration: float, disable_silence_detection: bool = False, min_duration: float = 0.0, vad_aggressiveness: Optional[int] = None) -> Tuple[np.ndarray, bool]:
876
877
  """Record audio from microphone with automatic silence detection.
877
878
 
878
879
  Uses WebRTC VAD to detect when the user stops speaking and automatically
@@ -885,21 +886,25 @@ def record_audio_with_silence_detection(max_duration: float, disable_silence_det
885
886
  vad_aggressiveness: VAD aggressiveness level (0-3). If None, uses VAD_AGGRESSIVENESS from config
886
887
 
887
888
  Returns:
888
- Numpy array of recorded audio samples
889
+ Tuple of (audio_data, speech_detected):
890
+ - audio_data: Numpy array of recorded audio samples
891
+ - speech_detected: Boolean indicating if speech was detected during recording
889
892
  """
890
893
 
891
894
  logger.info(f"record_audio_with_silence_detection called - VAD_AVAILABLE={VAD_AVAILABLE}, DISABLE_SILENCE_DETECTION={DISABLE_SILENCE_DETECTION}, min_duration={min_duration}")
892
895
 
893
896
  if not VAD_AVAILABLE:
894
897
  logger.warning("webrtcvad not available, falling back to fixed duration recording")
895
- return record_audio(max_duration)
898
+ # For fallback, assume speech is present since we can't detect
899
+ return (record_audio(max_duration), True)
896
900
 
897
901
  if DISABLE_SILENCE_DETECTION or disable_silence_detection:
898
902
  if disable_silence_detection:
899
903
  logger.info("Silence detection disabled for this interaction by request")
900
904
  else:
901
905
  logger.info("Silence detection disabled globally via VOICEMODE_DISABLE_SILENCE_DETECTION")
902
- return record_audio(max_duration)
906
+ # For fallback, assume speech is present since we can't detect
907
+ return (record_audio(max_duration), True)
903
908
 
904
909
  logger.info(f"🎤 Recording with silence detection (max {max_duration}s)...")
905
910
 
@@ -940,6 +945,16 @@ def record_audio_with_silence_detection(max_duration: float, disable_silence_det
940
945
  f"Min duration: {MIN_RECORDING_DURATION}s, "
941
946
  f"Initial grace period: {INITIAL_SILENCE_GRACE_PERIOD}s")
942
947
 
948
+ if VAD_DEBUG:
949
+ logger.info(f"[VAD_DEBUG] Starting VAD recording with config:")
950
+ logger.info(f"[VAD_DEBUG] max_duration: {max_duration}s")
951
+ logger.info(f"[VAD_DEBUG] min_duration: {min_duration}s")
952
+ logger.info(f"[VAD_DEBUG] effective_min_duration: {max(MIN_RECORDING_DURATION, min_duration)}s")
953
+ logger.info(f"[VAD_DEBUG] VAD aggressiveness: {effective_vad_aggressiveness}")
954
+ logger.info(f"[VAD_DEBUG] Silence threshold: {SILENCE_THRESHOLD_MS}ms")
955
+ logger.info(f"[VAD_DEBUG] Sample rate: {SAMPLE_RATE}Hz (VAD using {vad_sample_rate}Hz)")
956
+ logger.info(f"[VAD_DEBUG] Chunk duration: {VAD_CHUNK_DURATION_MS}ms")
957
+
943
958
  def audio_callback(indata, frames, time, status):
944
959
  """Callback for continuous audio stream"""
945
960
  if status:
@@ -979,35 +994,53 @@ def record_audio_with_silence_detection(max_duration: float, disable_silence_det
979
994
  # Check if chunk contains speech
980
995
  try:
981
996
  is_speech = vad.is_speech(chunk_bytes, vad_sample_rate)
997
+ if VAD_DEBUG:
998
+ # Log VAD decision every 500ms for less spam
999
+ if int(recording_duration * 1000) % 500 == 0:
1000
+ rms = np.sqrt(np.mean(chunk.astype(float)**2))
1001
+ logger.info(f"[VAD_DEBUG] t={recording_duration:.1f}s: speech={is_speech}, RMS={rms:.0f}, state={'WAITING' if not speech_detected else 'ACTIVE'}")
982
1002
  except Exception as vad_e:
983
1003
  logger.warning(f"VAD error: {vad_e}, treating as speech")
984
1004
  is_speech = True
985
1005
 
986
- if is_speech:
987
- if not speech_detected:
988
- logger.debug("Speech detected, recording...")
989
- speech_detected = True
990
- silence_duration_ms = 0
1006
+ # State machine for speech detection
1007
+ if not speech_detected:
1008
+ # WAITING_FOR_SPEECH state
1009
+ if is_speech:
1010
+ logger.info("🎤 Speech detected, starting active recording")
1011
+ if VAD_DEBUG:
1012
+ logger.info(f"[VAD_DEBUG] STATE CHANGE: WAITING_FOR_SPEECH -> SPEECH_ACTIVE at t={recording_duration:.1f}s")
1013
+ speech_detected = True
1014
+ silence_duration_ms = 0
1015
+ # No timeout in this state - just keep waiting
1016
+ # The only exit is speech detection or max_duration
991
1017
  else:
992
- silence_duration_ms += VAD_CHUNK_DURATION_MS
993
- if speech_detected and silence_duration_ms % 200 == 0: # Log every 200ms
994
- logger.debug(f"Silence: {silence_duration_ms}ms")
1018
+ # We have detected speech at some point
1019
+ if is_speech:
1020
+ # SPEECH_ACTIVE state - reset silence counter
1021
+ silence_duration_ms = 0
1022
+ else:
1023
+ # SILENCE_AFTER_SPEECH state - accumulate silence
1024
+ silence_duration_ms += VAD_CHUNK_DURATION_MS
1025
+ if VAD_DEBUG and silence_duration_ms % 100 == 0: # More frequent logging in debug mode
1026
+ logger.info(f"[VAD_DEBUG] Accumulating silence: {silence_duration_ms}/{SILENCE_THRESHOLD_MS}ms, t={recording_duration:.1f}s")
1027
+ elif silence_duration_ms % 200 == 0: # Log every 200ms
1028
+ logger.debug(f"Silence: {silence_duration_ms}ms")
1029
+
1030
+ # Check if we should stop due to silence threshold
1031
+ # Use the larger of MIN_RECORDING_DURATION (global) or min_duration (parameter)
1032
+ effective_min_duration = max(MIN_RECORDING_DURATION, min_duration)
1033
+ if recording_duration >= effective_min_duration and silence_duration_ms >= SILENCE_THRESHOLD_MS:
1034
+ logger.info(f"✓ Silence threshold reached after {recording_duration:.1f}s of recording")
1035
+ if VAD_DEBUG:
1036
+ logger.info(f"[VAD_DEBUG] STOP: silence_duration={silence_duration_ms}ms >= threshold={SILENCE_THRESHOLD_MS}ms")
1037
+ logger.info(f"[VAD_DEBUG] STOP: recording_duration={recording_duration:.1f}s >= min_duration={effective_min_duration}s")
1038
+ stop_recording = True
1039
+ elif VAD_DEBUG and recording_duration < effective_min_duration:
1040
+ if int(recording_duration * 1000) % 500 == 0: # Log every 500ms
1041
+ logger.info(f"[VAD_DEBUG] Min duration not met: {recording_duration:.1f}s < {effective_min_duration}s")
995
1042
 
996
1043
  recording_duration += chunk_duration_s
997
-
998
- # Check stop conditions
999
- # Use the larger of MIN_RECORDING_DURATION (global) or min_duration (parameter)
1000
- effective_min_duration = max(MIN_RECORDING_DURATION, min_duration)
1001
- if speech_detected and recording_duration >= effective_min_duration:
1002
- if silence_duration_ms >= SILENCE_THRESHOLD_MS:
1003
- logger.info(f"✓ Silence detected after {recording_duration:.1f}s (min: {effective_min_duration:.1f}s), stopping recording")
1004
- stop_recording = True
1005
-
1006
- # Also stop if we haven't detected any speech after a grace period
1007
- # Give user time to start speaking
1008
- if not speech_detected and recording_duration >= INITIAL_SILENCE_GRACE_PERIOD:
1009
- logger.info(f"No speech detected after {INITIAL_SILENCE_GRACE_PERIOD}s grace period, stopping recording")
1010
- stop_recording = True
1011
1044
 
1012
1045
  except queue.Empty:
1013
1046
  # No audio data available, continue waiting
@@ -1019,17 +1052,26 @@ def record_audio_with_silence_detection(max_duration: float, disable_silence_det
1019
1052
  # Concatenate all chunks
1020
1053
  if chunks:
1021
1054
  full_recording = np.concatenate(chunks)
1022
- logger.info(f"✓ Recorded {len(full_recording)} samples ({recording_duration:.1f}s)")
1055
+
1056
+ if not speech_detected:
1057
+ logger.info(f"✓ Recording completed ({recording_duration:.1f}s) - No speech detected")
1058
+ if VAD_DEBUG:
1059
+ logger.info(f"[VAD_DEBUG] FINAL STATE: No speech was ever detected during recording")
1060
+ else:
1061
+ logger.info(f"✓ Recorded {len(full_recording)} samples ({recording_duration:.1f}s) with speech")
1062
+ if VAD_DEBUG:
1063
+ logger.info(f"[VAD_DEBUG] FINAL STATE: Speech was detected, recording complete")
1023
1064
 
1024
1065
  if DEBUG:
1025
1066
  # Calculate RMS for debug
1026
1067
  rms = np.sqrt(np.mean(full_recording.astype(float) ** 2))
1027
1068
  logger.debug(f"Recording stats - RMS: {rms:.2f}, Speech detected: {speech_detected}")
1028
1069
 
1029
- return full_recording
1070
+ # Return tuple: (audio_data, speech_detected)
1071
+ return (full_recording, speech_detected)
1030
1072
  else:
1031
1073
  logger.warning("No audio chunks recorded")
1032
- return np.array([])
1074
+ return (np.array([]), False)
1033
1075
 
1034
1076
  except Exception as e:
1035
1077
  logger.error(f"Recording with VAD failed: {e}")
@@ -1042,7 +1084,8 @@ def record_audio_with_silence_detection(max_duration: float, disable_silence_det
1042
1084
  logger.error(f"\n{help_message}")
1043
1085
 
1044
1086
  logger.info("Falling back to fixed duration recording")
1045
- return record_audio(max_duration)
1087
+ # For fallback, assume speech is present since we can't detect
1088
+ return (record_audio(max_duration), True)
1046
1089
 
1047
1090
  finally:
1048
1091
  # Restore stdio
@@ -1056,7 +1099,8 @@ def record_audio_with_silence_detection(max_duration: float, disable_silence_det
1056
1099
  except Exception as e:
1057
1100
  logger.error(f"VAD initialization failed: {e}")
1058
1101
  logger.info("Falling back to fixed duration recording")
1059
- return record_audio(max_duration)
1102
+ # For fallback, assume speech is present since we can't detect
1103
+ return (record_audio(max_duration), True)
1060
1104
 
1061
1105
 
1062
1106
  async def check_livekit_available() -> bool:
@@ -1713,7 +1757,7 @@ async def converse(
1713
1757
 
1714
1758
  record_start = time.perf_counter()
1715
1759
  logger.debug(f"About to call record_audio_with_silence_detection with duration={listen_duration}, disable_silence_detection={disable_silence_detection}, min_duration={min_listen_duration}, vad_aggressiveness={vad_aggressiveness}")
1716
- audio_data = await asyncio.get_event_loop().run_in_executor(
1760
+ audio_data, speech_detected = await asyncio.get_event_loop().run_in_executor(
1717
1761
  None, record_audio_with_silence_detection, listen_duration, disable_silence_detection, min_listen_duration, vad_aggressiveness
1718
1762
  )
1719
1763
  timings['record'] = time.perf_counter() - record_start
@@ -1736,14 +1780,27 @@ async def converse(
1736
1780
  result = "Error: Could not record audio"
1737
1781
  return result
1738
1782
 
1739
- # Convert to text
1740
- # Log STT start
1741
- if event_logger:
1742
- event_logger.log_event(event_logger.STT_START)
1743
-
1744
- stt_start = time.perf_counter()
1745
- response_text = await speech_to_text(audio_data, SAVE_AUDIO, AUDIO_DIR if SAVE_AUDIO else None, transport)
1746
- timings['stt'] = time.perf_counter() - stt_start
1783
+ # Check if no speech was detected
1784
+ if not speech_detected:
1785
+ logger.info("No speech detected during recording - skipping STT processing")
1786
+ response_text = None
1787
+ timings['stt'] = 0.0
1788
+
1789
+ # Still save the audio if configured
1790
+ if SAVE_AUDIO and AUDIO_DIR:
1791
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
1792
+ audio_path = os.path.join(AUDIO_DIR, f"no_speech_{timestamp}.wav")
1793
+ write(audio_path, SAMPLE_RATE, audio_data)
1794
+ logger.debug(f"Saved no-speech audio to: {audio_path}")
1795
+ else:
1796
+ # Convert to text
1797
+ # Log STT start
1798
+ if event_logger:
1799
+ event_logger.log_event(event_logger.STT_START)
1800
+
1801
+ stt_start = time.perf_counter()
1802
+ response_text = await speech_to_text(audio_data, SAVE_AUDIO, AUDIO_DIR if SAVE_AUDIO else None, transport)
1803
+ timings['stt'] = time.perf_counter() - stt_start
1747
1804
 
1748
1805
  # Log STT complete
1749
1806
  if event_logger:
File without changes
File without changes
File without changes
File without changes