talktocursor 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/INSTALL.md +249 -0
- package/README.md +177 -0
- package/build/config.js +82 -0
- package/build/index.js +166 -0
- package/build/settings-server.js +124 -0
- package/package.json +54 -0
- package/public/index.html +1574 -0
- package/scripts/auto-submit.py +394 -0
- package/scripts/silence_detector.py +146 -0
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Auto-Submit and Wispr Voice Loop for Cursor TTS MCP
|
|
4
|
+
|
|
5
|
+
Combines two features:
|
|
6
|
+
1. Auto-submit: Detects when text appears in the focused field and auto-presses Enter
|
|
7
|
+
2. Wispr voice loop: Watches for listen signals, triggers Wispr, detects silence, and pastes
|
|
8
|
+
|
|
9
|
+
How it works:
|
|
10
|
+
- Monitors the focused text field via Accessibility API for auto-submit
|
|
11
|
+
- Watches for listen-signal.json from the MCP server
|
|
12
|
+
- When signal detected, starts Wispr and monitors mic for silence
|
|
13
|
+
- Registers a manual trigger hotkey to start Wispr anytime
|
|
14
|
+
|
|
15
|
+
Requires macOS Accessibility permissions:
|
|
16
|
+
System Settings > Privacy & Security > Accessibility
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import time
|
|
20
|
+
import json
|
|
21
|
+
import os
|
|
22
|
+
import sys
|
|
23
|
+
import threading
|
|
24
|
+
import subprocess
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from ApplicationServices import (
|
|
27
|
+
AXUIElementCreateSystemWide,
|
|
28
|
+
AXUIElementCopyAttributeValue,
|
|
29
|
+
AXIsProcessTrusted,
|
|
30
|
+
)
|
|
31
|
+
from pynput import keyboard
|
|
32
|
+
from pynput.keyboard import Key, KeyCode, Controller, HotKey, GlobalHotKeys
|
|
33
|
+
|
|
34
|
+
# Import our silence detector
|
|
35
|
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
36
|
+
from silence_detector import wait_for_silence
|
|
37
|
+
|
|
38
|
+
# ─── Configuration ───────────────────────────────────────────────────────────
|
|
39
|
+
|
|
40
|
+
CONFIG_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'config.json')
|
|
41
|
+
SIGNAL_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'listen-signal.json')
|
|
42
|
+
TTS_COMPLETE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'tts-complete.json')
|
|
43
|
+
|
|
44
|
+
def load_config():
    """Return the effective settings: built-in defaults overlaid with config.json.

    Only options that already exist in the defaults are honoured; unknown
    sections and unknown options in the file are ignored.  A missing,
    unreadable, or malformed file leaves the defaults untouched, and a
    section that is not a JSON object (for example ``null``) is skipped
    instead of raising.

    Returns:
        dict: two sections, ``'autoSubmit'`` and ``'wisprLoop'``, each
        mapping option names to values.
    """
    defaults = {
        'autoSubmit': {
            'enabled': True,
            'silenceDelay': 3.0,
            'minTextLength': 15,
            'targetApp': 'Cursor',
        },
        'wisprLoop': {
            'enabled': False,
            'ttsDelay': 4.0,
            'silenceThreshold': 0.02,
            'silenceDuration': 2.0,
            'wisprHotkey': 'shift+ctrl',
            'manualTriggerHotkey': 'ctrl+shift+l',
        },
    }

    try:
        with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
            user_config = json.load(f)
    except (OSError, ValueError):
        # OSError: file missing or unreadable.  ValueError: invalid JSON
        # (json.JSONDecodeError) or bytes that are not valid UTF-8.
        return defaults

    if not isinstance(user_config, dict):
        # e.g. the file holds a JSON array or a bare scalar
        return defaults

    for section, options in defaults.items():
        overrides = user_config.get(section)
        if not isinstance(overrides, dict):
            continue
        for name in options:
            if name in overrides:
                options[name] = overrides[name]

    return defaults
|
|
72
|
+
|
|
73
|
+
# Resolve the configuration once at import time and expose every option as a
# module-level constant.
config = load_config()

# Options of the 'autoSubmit' section.
(AUTO_SUBMIT_ENABLED,
 SILENCE_DELAY,
 MIN_TEXT_LENGTH,
 TARGET_APP) = (
    config['autoSubmit'][option]
    for option in ('enabled', 'silenceDelay', 'minTextLength', 'targetApp')
)

# Options of the 'wisprLoop' section.
(WISPR_LOOP_ENABLED,
 TTS_DELAY,
 SILENCE_THRESHOLD,
 SILENCE_DURATION,
 WISPR_HOTKEY,
 MANUAL_TRIGGER_HOTKEY) = (
    config['wisprLoop'][option]
    for option in ('enabled', 'ttsDelay', 'silenceThreshold',
                   'silenceDuration', 'wisprHotkey', 'manualTriggerHotkey')
)
|
|
86
|
+
|
|
87
|
+
# ─── State ───────────────────────────────────────────────────────────────────
|
|
88
|
+
|
|
89
|
+
# Auto-submit state
|
|
90
|
+
# Mutable state shared between the polling thread and the submit timer.
monitoring = True        # False while do_submit() is injecting its own Enter
last_change_time = 0.0   # time.time() of the most recent observed text change
last_text = text_at_change_start = submit_timer = None

# Controllers
# Synthetic keyboard used for every injected key press.
ctrl = Controller()
|
|
98
|
+
|
|
99
|
+
# ─── Helpers ─────────────────────────────────────────────────────────────────
|
|
100
|
+
|
|
101
|
+
def get_frontmost_app():
    """Return the name of the frontmost application process, or "" on failure.

    The name is obtained by asking System Events through ``osascript``.
    Any exception, including the 2-second timeout, yields an empty string.
    """
    script = 'tell application "System Events" to get name of first application process whose frontmost is true'
    command = ['osascript', '-e', script]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=2,
        )
        return completed.stdout.strip()
    except Exception:
        return ""
|
|
112
|
+
|
|
113
|
+
def get_focused_text():
    """Return the focused UI element's ``AXValue`` as a string, or None.

    None is returned when either Accessibility lookup reports a non-zero
    error code, when the element or its value is missing, or when any
    exception is raised along the way.
    """
    try:
        status, element = AXUIElementCopyAttributeValue(
            AXUIElementCreateSystemWide(), "AXFocusedUIElement", None
        )
        if status == 0 and element is not None:
            status, content = AXUIElementCopyAttributeValue(element, "AXValue", None)
            if status == 0 and content is not None:
                return str(content)
        return None
    except Exception:
        return None
|
|
130
|
+
|
|
131
|
+
def parse_hotkey(hotkey_str):
    """Parse a hotkey string such as 'shift+ctrl' into pynput key objects.

    Parts are separated by '+', matched case-insensitively, and stripped of
    surrounding whitespace.  Recognised parts are:

    - modifier names and their aliases: shift, ctrl/control, alt/option,
      cmd/command;
    - a single character, converted with ``KeyCode.from_char``;
    - any other member name of pynput's ``Key`` enum (e.g. 'space', 'enter').

    Unknown parts are reported and skipped rather than dropped silently, so
    a typo in the configuration is visible instead of quietly changing the
    combination that gets pressed.

    Args:
        hotkey_str: The hotkey description, e.g. ``'ctrl+shift+l'``.

    Returns:
        list: Key objects in the order they appear in *hotkey_str*.
    """
    modifiers = {
        'shift': Key.shift,
        'ctrl': Key.ctrl,
        'control': Key.ctrl,
        'alt': Key.alt,
        'option': Key.alt,
        'cmd': Key.cmd,
        'command': Key.cmd,
    }

    keys = []
    for raw_part in hotkey_str.lower().split('+'):
        part = raw_part.strip()
        if not part:
            continue
        if part in modifiers:
            keys.append(modifiers[part])
        elif len(part) == 1:
            keys.append(KeyCode.from_char(part))
        else:
            # Fall back to the Key enum's own member names for special keys.
            named_key = Key.__members__.get(part)
            if named_key is not None:
                keys.append(named_key)
            else:
                print(f"[wispr-loop] Warning: ignoring unknown key '{part}' in hotkey '{hotkey_str}'")
    return keys
|
|
148
|
+
|
|
149
|
+
def press_hotkey(keys):
    """Press every key in *keys*, hold for 50 ms, then release in reverse order.

    Keys that were actually pressed are always released, even when a later
    press or the pause raises, so a synthetic modifier is never left held
    down.

    Args:
        keys: Key objects to press together, in press order.
    """
    held = []
    try:
        # Press all keys
        for key in keys:
            ctrl.press(key)
            held.append(key)
        time.sleep(0.05)
    finally:
        # Release in reverse order, limited to the keys that went down
        for key in reversed(held):
            ctrl.release(key)
|
|
158
|
+
|
|
159
|
+
def wait_for_tts_completion(timeout=15.0):
    """Wait for the TTS completion signal file, giving up after *timeout*.

    Any completion file that already exists when the call starts is treated
    as stale and deleted; the function then polls every 100 ms for a new one
    and deletes it once seen.

    Args:
        timeout: Maximum number of seconds to wait.

    Returns:
        bool: True if the signal file appeared in time, False on timeout.
    """
    print("[wispr-loop] Waiting for TTS to complete...")

    # Clear any stale completion signal first (best effort).
    # NOTE(review): a completion file written before this call is discarded
    # here - confirm the producer always writes it after this point.
    try:
        os.remove(TTS_COMPLETE_PATH)
    except OSError:
        pass  # nothing to clear, or not removable

    # Monotonic clock: the timeout must not be affected by system clock changes.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if os.path.exists(TTS_COMPLETE_PATH):
            print("[wispr-loop] TTS completion signal received!")
            # Delete the completion signal so it is not seen again
            try:
                os.remove(TTS_COMPLETE_PATH)
            except OSError:
                pass
            return True
        time.sleep(0.1)  # Poll every 100ms

    # Timeout - proceed anyway with a warning
    print(f"[wispr-loop] Warning: TTS completion timeout after {timeout}s, proceeding anyway...")
    return False
|
|
185
|
+
|
|
186
|
+
def trigger_wispr_loop():
    """Run one dictation cycle: toggle Wispr on, wait for silence, toggle it off.

    The configured Wispr hotkey is pressed once to start recording.  After
    ``wait_for_silence`` returns, the same hotkey is pressed again only if
    speech was reported.
    """
    print(f"[wispr-loop] Starting Wispr voice loop...")

    # The same key combination both starts and stops the recording.
    toggle_keys = parse_hotkey(WISPR_HOTKEY)

    print(f"[wispr-loop] Pressing {WISPR_HOTKEY} to start Wispr recording...")
    press_hotkey(toggle_keys)

    print(f"[wispr-loop] Monitoring mic for silence (threshold: {SILENCE_THRESHOLD}, duration: {SILENCE_DURATION}s)...")
    heard_speech = wait_for_silence(
        silence_threshold=SILENCE_THRESHOLD,
        silence_duration=SILENCE_DURATION,
        verbose=True,
    )

    if not heard_speech:
        # NOTE(review): the hotkey is not pressed a second time on this path -
        # confirm that leaving Wispr in its current state is intended.
        print(f"[wispr-loop] No speech detected, cancelling")
        return

    # The user spoke: the second press stops Wispr, which triggers the paste.
    print(f"[wispr-loop] Pressing {WISPR_HOTKEY} to stop Wispr and paste...")
    press_hotkey(toggle_keys)
    print(f"[wispr-loop] Wispr should paste text now, auto-submit will handle pressing Enter")
|
|
212
|
+
|
|
213
|
+
# ─── Auto-Submit Monitor ─────────────────────────────────────────────────────
|
|
214
|
+
|
|
215
|
+
def do_submit(new_text_length):
    """Press Enter if conditions are met.

    Runs as the callback of the submit timer.  Enter is pressed only when
    the burst added at least ``MIN_TEXT_LENGTH`` characters and the
    frontmost application is ``TARGET_APP``.

    Args:
        new_text_length: Number of characters added during the burst that
            scheduled this submit.
    """
    global submit_timer, monitoring
    submit_timer = None  # this timer has fired; nothing is pending any more

    if new_text_length < MIN_TEXT_LENGTH:
        return

    if get_frontmost_app() != TARGET_APP:
        return

    # Briefly pause monitoring to avoid detecting our own Enter keypress.
    monitoring = False
    try:
        print(f"[auto-submit] Dictation detected ({new_text_length} new chars), submitting...")
        time.sleep(0.15)
        ctrl.press(Key.enter)
        ctrl.release(Key.enter)
        time.sleep(0.5)
    finally:
        # Always resume: a failed key injection must not leave the monitor
        # switched off for the rest of the process lifetime.
        monitoring = True
|
|
235
|
+
|
|
236
|
+
def monitor_text_field():
    """Poll the focused text field for changes (auto-submit monitor).

    Runs forever, sampling the focused field roughly seven times per second.
    Consecutive changes form a "burst"; a burst starts at the first change
    ever seen or at the first change after more than ``SILENCE_DELAY``
    seconds without one.  Each change cancels any pending submit, and if the
    text has grown by at least ``MIN_TEXT_LENGTH`` characters since the
    burst began, a new submit is scheduled ``SILENCE_DELAY`` seconds later.

    Text that is already in the field on the very first successful poll only
    establishes the starting point; it is not counted as new input.
    """
    global last_text, last_change_time, text_at_change_start, submit_timer, monitoring

    last_error = None  # repr of the last reported failure, to avoid log spam

    while True:
        if not AUTO_SUBMIT_ENABLED or not monitoring:
            time.sleep(0.2)
            continue

        try:
            current_text = get_focused_text()

            if current_text is None:
                time.sleep(0.15)
                continue

            if last_text is None:
                # First observation: remember it, but do not treat content
                # that existed before monitoring started as dictated text.
                last_text = current_text
                last_change_time = time.time()
                time.sleep(0.15)
                continue

            # Detect text change
            if current_text != last_text:
                now = time.time()

                # A change after a quiet gap starts a new burst, so the
                # baseline becomes the text as it was just before this change.
                if (text_at_change_start is None
                        or now - last_change_time > SILENCE_DELAY):
                    text_at_change_start = last_text

                new_chars = len(current_text) - len(text_at_change_start)

                last_text = current_text
                last_change_time = now

                # Cancel any pending submit.  Work on a local reference: the
                # timer thread may set submit_timer to None at any moment.
                pending = submit_timer
                if pending is not None:
                    pending.cancel()

                # Only schedule submit if meaningful text was added
                if new_chars >= MIN_TEXT_LENGTH:
                    submit_timer = threading.Timer(SILENCE_DELAY, do_submit, args=[new_chars])
                    submit_timer.daemon = True
                    submit_timer.start()

        except Exception as exc:
            # Keep polling, but report each distinct failure once instead of
            # swallowing it silently.
            message = repr(exc)
            if message != last_error:
                print(f"[auto-submit] Monitor error: {message}")
                last_error = message

        time.sleep(0.15)  # Poll ~7 times per second
|
|
279
|
+
|
|
280
|
+
# ─── Wispr Loop Signal Watcher ──────────────────────────────────────────────
|
|
281
|
+
|
|
282
|
+
def watch_for_signals():
    """Poll for listen-signal.json forever and start a Wispr loop per signal.

    When the signal file exists it is deleted, the TTS completion wait runs
    in this thread, and the voice loop is then started on a daemon thread.
    Errors are printed and polling continues.
    """
    while True:
        if WISPR_LOOP_ENABLED:
            try:
                if os.path.exists(SIGNAL_PATH):
                    print(f"[wispr-loop] Listen signal detected!")

                    # Consume the signal so it triggers only once.
                    os.remove(SIGNAL_PATH)

                    # Let the spoken response finish before recording.
                    wait_for_tts_completion()

                    # Run the voice loop on its own thread.
                    worker = threading.Thread(target=trigger_wispr_loop, daemon=True)
                    worker.start()
            except Exception as e:
                print(f"[wispr-loop] Error: {e}")
            pause = 0.3  # Poll for signal file every 300ms
        else:
            pause = 0.5
        time.sleep(pause)
|
|
306
|
+
|
|
307
|
+
# ─── Manual Trigger Hotkey ──────────────────────────────────────────────────
|
|
308
|
+
|
|
309
|
+
def setup_manual_trigger():
    """Register a global hotkey to manually trigger the Wispr loop.

    ``MANUAL_TRIGGER_HOTKEY`` is converted to the form expected by
    ``GlobalHotKeys`` (e.g. "ctrl+shift+l" -> '<ctrl>+<shift>+l').  The
    aliases accepted by ``parse_hotkey`` (control, option, command) are
    mapped to the names pynput knows (ctrl, alt, cmd), and every
    multi-character key name is wrapped in angle brackets unless it already
    is.

    Returns:
        The started ``GlobalHotKeys`` listener, or None when the Wispr loop
        is disabled or registration fails.
    """
    if not WISPR_LOOP_ENABLED:
        return None

    # pynput resolves '<name>' against its Key enum, which has no members
    # called 'control', 'option' or 'command'.
    canonical_names = {'control': 'ctrl', 'option': 'alt', 'command': 'cmd'}

    formatted_parts = []
    for part in MANUAL_TRIGGER_HOTKEY.lower().split('+'):
        part = part.strip()
        part = canonical_names.get(part, part)
        already_bracketed = part.startswith('<') and part.endswith('>')
        if len(part) > 1 and not already_bracketed:
            part = f'<{part}>'
        formatted_parts.append(part)
    formatted_hotkey = '+'.join(formatted_parts)

    def on_manual_trigger():
        print("[wispr-loop] Manual trigger activated!")
        # Run the loop off the listener thread so the listener stays responsive.
        threading.Thread(target=trigger_wispr_loop, daemon=True).start()

    try:
        hotkeys = GlobalHotKeys({
            formatted_hotkey: on_manual_trigger
        })
        hotkeys.start()
        return hotkeys
    except Exception as e:
        print(f"[wispr-loop] Failed to register manual trigger hotkey: {e}")
        return None
|
|
339
|
+
|
|
340
|
+
# ─── Main ────────────────────────────────────────────────────────────────────
|
|
341
|
+
|
|
342
|
+
def main():
    """Print the active settings, start the enabled monitors, and idle.

    Warns (without exiting) when Accessibility permissions are missing, then
    starts the text-field monitor and/or the signal watcher on daemon
    threads, registers the manual trigger hotkey, and sleeps until Ctrl+C.
    """
    # Check accessibility permissions
    if not AXIsProcessTrusted():
        for warning_line in (
            " ERROR: Accessibility permissions not granted!",
            " Go to: System Settings > Privacy & Security > Accessibility",
            " Add your terminal app (Terminal, iTerm, Cursor, etc.)",
            "",
            " The script will continue but may not work correctly.",
            "",
        ):
            print(warning_line)

    print(f"""
Cursor Auto-Submit & Wispr Voice Loop
──────────────────────────────────────

Auto-Submit: {'Enabled' if AUTO_SUBMIT_ENABLED else 'Disabled'}
Submit delay: {SILENCE_DELAY}s
Min text length: {MIN_TEXT_LENGTH} chars
Target app: {TARGET_APP}

Wispr Loop: {'Enabled' if WISPR_LOOP_ENABLED else 'Disabled'}
TTS delay: {TTS_DELAY}s
Silence thresh: {SILENCE_THRESHOLD}
Silence duration: {SILENCE_DURATION}s
Wispr hotkey: {WISPR_HOTKEY}
Manual trigger: {MANUAL_TRIGGER_HOTKEY}

Press Ctrl+C to stop.
""")

    # Each enabled monitor gets its own daemon thread.
    if AUTO_SUBMIT_ENABLED:
        threading.Thread(target=monitor_text_field, daemon=True).start()
        print("[auto-submit] Text field monitor started")

    if WISPR_LOOP_ENABLED:
        threading.Thread(target=watch_for_signals, daemon=True).start()
        print("[wispr-loop] Signal watcher started")

    # Hold on to the listener for as long as main() runs.
    hotkey_listener = setup_manual_trigger()
    if hotkey_listener:
        print(f"[wispr-loop] Manual trigger registered: {MANUAL_TRIGGER_HOTKEY}")

    # All workers are daemons, so the main thread only has to stay alive.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("\n[main] Stopped.")


if __name__ == '__main__':
    main()
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Silence Detector Module
|
|
4
|
+
|
|
5
|
+
Monitors microphone input and detects when the user has stopped speaking
|
|
6
|
+
based on RMS (root mean square) amplitude analysis.
|
|
7
|
+
|
|
8
|
+
State machine:
|
|
9
|
+
IDLE -> SPEECH (RMS exceeds threshold)
|
|
10
|
+
SPEECH -> TRAILING_SILENCE (RMS drops below threshold)
|
|
11
|
+
TRAILING_SILENCE -> SPEECH (RMS exceeds threshold again)
|
|
12
|
+
TRAILING_SILENCE -> DONE (silence duration exceeded)
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
import sounddevice as sd
|
|
17
|
+
import time
|
|
18
|
+
from enum import Enum
|
|
19
|
+
|
|
20
|
+
class State(Enum):
    """Phases of the silence detector's state machine."""

    IDLE = "idle"                          # no speech heard yet
    SPEECH = "speech"                      # level is above the threshold
    TRAILING_SILENCE = "trailing_silence"  # level dropped; timing the pause
    DONE = "done"                          # the pause lasted long enough


class SilenceDetector:
    """Detect the end of an utterance from the microphone's RMS level."""

    def __init__(self,
                 silence_threshold=0.02,
                 silence_duration=2.0,
                 sample_rate=16000,
                 chunk_size=1024):
        """
        Initialize the silence detector.

        Args:
            silence_threshold: RMS amplitude threshold for speech detection
            silence_duration: Seconds of silence needed to confirm user stopped
            sample_rate: Audio sample rate in Hz
            chunk_size: Number of samples per chunk
        """
        self.silence_threshold = silence_threshold
        self.silence_duration = silence_duration
        self.sample_rate = sample_rate
        self.chunk_size = chunk_size

        self._reset()

    def _reset(self):
        """Return the state machine to its initial, nothing-heard-yet state."""
        self.state = State.IDLE
        # time.monotonic() value at which the current trailing silence began
        self.silence_start_time = None
        self.speech_detected = False

    def compute_rms(self, audio_chunk):
        """Compute the RMS (root mean square) of an audio chunk.

        Samples are converted to float64 first so that integer input
        (e.g. int16) cannot overflow when squared.  An empty chunk yields
        0.0 instead of NaN.

        Args:
            audio_chunk: Array-like of samples; any shape.

        Returns:
            float: The RMS amplitude over all samples.
        """
        samples = np.asarray(audio_chunk, dtype=np.float64)
        if samples.size == 0:
            return 0.0
        return float(np.sqrt(np.mean(np.square(samples))))

    def wait_for_silence(self, verbose=True, speech_timeout=None):
        """
        Monitor the microphone and wait for silence after detecting speech.

        The state machine is reset on every call, so one detector can be
        used for several utterances.

        Args:
            verbose: Print state transitions when True.
            speech_timeout: Optional number of seconds to wait for speech to
                begin.  None (the default) waits indefinitely.

        Returns:
            bool: True once speech was heard and followed by
            ``silence_duration`` seconds of silence; False if
            ``speech_timeout`` elapsed before any speech was detected.
        """
        self._reset()

        if verbose:
            print("[silence-detector] Listening for speech...")
            print(f"[silence-detector] Threshold: {self.silence_threshold:.4f}, Silence duration: {self.silence_duration}s")

        # Monotonic clock: durations must not depend on wall-clock changes.
        listen_start = time.monotonic()

        # Open the microphone stream
        with sd.InputStream(samplerate=self.sample_rate,
                            channels=1,
                            blocksize=self.chunk_size) as stream:

            while self.state != State.DONE:
                # Read audio chunk
                audio_data, overflowed = stream.read(self.chunk_size)

                if overflowed:
                    print("[silence-detector] Warning: Audio buffer overflow")

                # Compute RMS amplitude
                rms = self.compute_rms(audio_data)
                now = time.monotonic()

                # State machine transitions
                if self.state == State.IDLE:
                    if rms > self.silence_threshold:
                        self.state = State.SPEECH
                        self.speech_detected = True
                        if verbose:
                            print(f"[silence-detector] Speech detected (RMS: {rms:.4f})")
                    elif (speech_timeout is not None
                          and now - listen_start >= speech_timeout):
                        if verbose:
                            print(f"[silence-detector] No speech within {speech_timeout}s")
                        break

                elif self.state == State.SPEECH:
                    if rms < self.silence_threshold:
                        self.state = State.TRAILING_SILENCE
                        self.silence_start_time = now
                        if verbose:
                            print("[silence-detector] Trailing silence started...")
                    # else: still speaking, remain in SPEECH state

                elif self.state == State.TRAILING_SILENCE:
                    if rms > self.silence_threshold:
                        # User started speaking again
                        self.state = State.SPEECH
                        self.silence_start_time = None
                        if verbose:
                            print(f"[silence-detector] Speech resumed (RMS: {rms:.4f})")
                    else:
                        # Check if silence duration exceeded
                        elapsed = now - self.silence_start_time
                        if elapsed >= self.silence_duration:
                            self.state = State.DONE
                            if verbose:
                                print(f"[silence-detector] Silence confirmed ({elapsed:.1f}s)")

        if verbose:
            print("[silence-detector] Done")

        return self.speech_detected
|
|
114
|
+
|
|
115
|
+
def wait_for_silence(silence_threshold=0.02,
                     silence_duration=2.0,
                     verbose=True):
    """
    Build a one-off SilenceDetector and run it.

    Args:
        silence_threshold: RMS threshold passed to the detector.
        silence_duration: Seconds of silence passed to the detector.
        verbose: Forwarded to the detector's wait_for_silence().

    Returns:
        The value returned by SilenceDetector.wait_for_silence().
    """
    settings = {
        "silence_threshold": silence_threshold,
        "silence_duration": silence_duration,
    }
    return SilenceDetector(**settings).wait_for_silence(verbose=verbose)
|
|
129
|
+
|
|
130
|
+
if __name__ == "__main__":
    # Manual smoke test: listen once with the default settings and report.
    for banner_line in (
        "Silence Detector Test",
        "=====================",
        "Start speaking into your microphone...",
    ):
        print(banner_line)
    print()

    speech_detected = wait_for_silence(
        silence_threshold=0.02,
        silence_duration=2.0,
        verbose=True,
    )

    outcome = (
        "\n✓ Speech was detected and silence confirmed"
        if speech_detected
        else "\n✗ No speech detected"
    )
    print(outcome)
|