@staff0rd/assist 0.78.0 → 0.80.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -0
- package/claude/commands/comment.md +39 -0
- package/claude/commands/voice-logs.md +5 -0
- package/claude/commands/voice-setup.md +5 -0
- package/claude/commands/voice-start.md +5 -0
- package/claude/commands/voice-status.md +5 -0
- package/claude/commands/voice-stop.md +5 -0
- package/claude/settings.json +11 -0
- package/dist/commands/voice/python/audio_capture.py +49 -0
- package/dist/commands/voice/python/dispatch.py +14 -0
- package/dist/commands/voice/python/keyboard.py +73 -0
- package/dist/commands/voice/python/list_devices.py +20 -0
- package/dist/commands/voice/python/logger.py +38 -0
- package/dist/commands/voice/python/pyproject.toml +34 -0
- package/dist/commands/voice/python/setup_models.py +91 -0
- package/dist/commands/voice/python/smart_turn.py +63 -0
- package/dist/commands/voice/python/stt.py +51 -0
- package/dist/commands/voice/python/uv.lock +5947 -0
- package/dist/commands/voice/python/vad.py +50 -0
- package/dist/commands/voice/python/voice_daemon.py +362 -0
- package/dist/commands/voice/python/wake_word.py +26 -0
- package/dist/index.js +562 -179
- package/package.json +2 -2
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Silero VAD wrapper (ONNX)."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import onnxruntime as ort
|
|
7
|
+
|
|
8
|
+
from logger import log
|
|
9
|
+
|
|
10
|
+
DEFAULT_THRESHOLD = 0.5
CONTEXT_SIZE = 64  # v5/v6 requires 64 context samples prepended at 16kHz


class SileroVAD:
    """Stateful wrapper around the Silero VAD ONNX model.

    The model is fed one audio chunk at a time.  Between calls the wrapper
    keeps the recurrent ``state`` tensor returned by the model and the last
    ``CONTEXT_SIZE`` samples, which are prepended to the next chunk.

    Attributes:
        threshold: Speech-probability threshold exposed for callers to
            compare against; this class never applies it itself.
    """

    def __init__(self):
        """Locate the model file and create the ONNX Runtime session.

        The path comes from ``VOICE_MODEL_VAD`` when set; otherwise
        ``silero_vad.onnx`` is looked up inside ``VOICE_MODELS_DIR``
        (default ``~/.assist/voice/models``).
        """
        model_path = os.environ.get("VOICE_MODEL_VAD")
        if not model_path:
            models_dir = os.environ.get(
                "VOICE_MODELS_DIR",
                os.path.expanduser("~/.assist/voice/models"),
            )
            model_path = os.path.join(models_dir, "silero_vad.onnx")

        log("vad_init", f"model={model_path}")
        self._session = ort.InferenceSession(
            model_path, providers=["CPUExecutionProvider"]
        )
        self._sample_rate = np.array(16000, dtype=np.int64)
        self.threshold = DEFAULT_THRESHOLD
        # Single source of truth for the initial recurrent state and context.
        self.reset()

    def process(self, audio: np.ndarray) -> float:
        """Process a chunk of audio, return speech probability.

        Args:
            audio: One chunk of samples; converted to float32 before use.

        Returns:
            The first element of the model's first output, as a float.
        """
        chunk = np.asarray(audio, dtype=np.float32)
        # Prepend context (last CONTEXT_SIZE samples seen so far).
        window = np.concatenate([self._context, chunk])
        ort_inputs = {
            "input": window.reshape(1, -1),
            "state": self._state,
            "sr": self._sample_rate,
        }
        out, state = self._session.run(None, ort_inputs)
        self._state = state
        # Take the tail of context+chunk rather than of the chunk alone so the
        # context stays exactly CONTEXT_SIZE long even when a chunk is shorter
        # than CONTEXT_SIZE; for longer chunks this equals chunk[-CONTEXT_SIZE:].
        self._context = window[-CONTEXT_SIZE:]
        return float(out[0][0])

    def reset(self) -> None:
        """Zero the recurrent state and the carried-over context samples."""
        self._state = np.zeros((2, 1, 128), dtype=np.float32)
        self._context = np.zeros(CONTEXT_SIZE, dtype=np.float32)
|
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
"""Voice daemon entry point — main loop and signal handling."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import signal
|
|
5
|
+
import sys
|
|
6
|
+
import time
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
from audio_capture import AudioCapture, BLOCK_SIZE
|
|
11
|
+
from logger import DEBUG, log
|
|
12
|
+
from smart_turn import SmartTurn
|
|
13
|
+
from stt import ParakeetSTT
|
|
14
|
+
from vad import SileroVAD
|
|
15
|
+
from wake_word import check_wake_word
|
|
16
|
+
|
|
17
|
+
import keyboard
|
|
18
|
+
|
|
19
|
+
# Daemon states
IDLE = "idle"
LISTENING = "listening"
ACTIVATED = "activated"  # wake word was heard on its own; the next utterance is the command

# Hard cap (seconds) on a single utterance before it is processed regardless
MAX_SPEECH_SECONDS = 30

# Partial-STT cadence, in samples: 16000 samples is 1 second of audio
PARTIAL_STT_INTERVAL = 16_000

# Continuous trailing silence (ms) required before the segment is handed to
# smart turn; mirrors the reference implementation
# (record_and_predict.py STOP_MS=1000).
STOP_MS = 1000
# The same duration expressed in capture blocks (original note: ~31 chunks).
STOP_CHUNKS = STOP_MS * 16000 // (BLOCK_SIZE * 1000)

# Seconds to wait for a command after a wake-word-only utterance
ACTIVATED_TIMEOUT = 10.0
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _print_vad_bar(
|
|
40
|
+
prob: float, threshold: float, state: str, chunk: np.ndarray
|
|
41
|
+
) -> None:
|
|
42
|
+
"""Print a live VAD meter to stderr when debug mode is on."""
|
|
43
|
+
rms = float(np.sqrt(np.mean(chunk**2)))
|
|
44
|
+
peak = float(np.max(np.abs(chunk)))
|
|
45
|
+
width = 40
|
|
46
|
+
filled = int(prob * width)
|
|
47
|
+
bar = "█" * filled + "░" * (width - filled)
|
|
48
|
+
marker = ">" if prob > threshold else " "
|
|
49
|
+
print(
|
|
50
|
+
f"\r {marker} VAD {prob:.2f} [{bar}] "
|
|
51
|
+
f"rms={rms:.4f} peak={peak:.4f} {state:10s}",
|
|
52
|
+
end="",
|
|
53
|
+
file=sys.stderr,
|
|
54
|
+
flush=True,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class VoiceDaemon:
    """Voice-to-keystroke state machine driven by microphone chunks.

    Each chunk read from the microphone is scored by the VAD.  Speech moves
    the daemon from IDLE to LISTENING, where audio is buffered, transcribed
    about once per second for incremental typing, and finalized once smart
    turn reports the turn complete (or the hard length cap is reached).  A
    wake word heard with no command moves the daemon to ACTIVATED, in which
    the whole next utterance is treated as the command.
    """

    def __init__(self):
        """Create the capture device and load the VAD, smart-turn and STT models."""
        self._running = True
        self._state = IDLE
        self._audio_buffer: list[np.ndarray] = []

        log("daemon_init", "Initializing models...")
        self._mic = AudioCapture()
        self._vad = SileroVAD()
        self._smart_turn = SmartTurn()
        self._stt = ParakeetSTT()
        log("daemon_ready")

        # Incremental typing state
        self._wake_detected = False  # a partial transcript already contained wake word + command
        self._typed_text = ""  # text this daemon has typed so far for the current utterance
        self._last_partial_at = 0  # sample count at which partial STT last ran
        self._activated_at = 0.0  # time.monotonic() when ACTIVATED was entered

    def _handle_signal(self, signum, frame) -> None:
        """Signal handler: log the signal and ask the main loop to stop."""
        log("daemon_signal", f"Received signal {signum}")
        self._running = False

    def _run_partial_stt(self) -> None:
        """Run STT on accumulated audio and type incrementally."""
        if not self._audio_buffer:
            return

        text = self._stt.transcribe(np.concatenate(self._audio_buffer))
        stripped = text.strip()
        if not stripped:
            return

        if DEBUG:
            print(f"\n Partial: {text}", file=sys.stderr)

        if self._state == ACTIVATED:
            # Already activated — everything is the command, no wake word
            # needed.  With nothing typed yet, _update_typed_text deletes
            # nothing and types the full text.
            if stripped != self._typed_text:
                self._update_typed_text(stripped)
            return

        found, command = check_wake_word(text)
        if not (found and command):
            return

        if not self._wake_detected:
            self._wake_detected = True
            log("wake_word_detected", command)
            if DEBUG:
                print(f" Wake word! Typing: {command}", file=sys.stderr)
            keyboard.type_text(command)
            self._typed_text = command
        elif command != self._typed_text:
            self._update_typed_text(command)

    def _update_typed_text(self, new_text: str) -> None:
        """Diff old typed text vs new, backspace + type the difference."""
        old = self._typed_text

        # Length of the common prefix of what is on screen and what should be.
        common = 0
        limit = min(len(old), len(new_text))
        while common < limit and old[common] == new_text[common]:
            common += 1

        # Backspace what's wrong, type new suffix
        to_delete = len(old) - common
        to_type = new_text[common:]
        if to_delete > 0:
            keyboard.backspace(to_delete)
        if to_type:
            keyboard.type_text(to_type)
        self._typed_text = new_text

    def _check_segment_end(self, sample_count: int, trailing_silence: int) -> bool:
        """Check if the current segment is done.

        Follows the reference smart-turn implementation:
        1. Accumulate speech + trailing silence.
        2. After STOP_MS of continuous silence, send the full segment to smart turn.
        3. If smart turn says "Incomplete", keep listening (return False).
        4. If smart turn says "Complete", finalize (return True).
        5. Hard cap at MAX_SPEECH_SECONDS always finalizes.

        Args:
            sample_count: Samples buffered so far for this utterance.
            trailing_silence: Consecutive below-threshold chunks at the tail.

        Returns:
            True when the utterance should be finalized now.
        """
        if sample_count >= MAX_SPEECH_SECONDS * 16000:
            log("max_speech", "Reached max speech duration")
            return True

        # NOTE(review): after an "Incomplete" verdict trailing_silence is still
        # >= STOP_CHUNKS, so smart turn is re-run on every following silent
        # chunk until speech resumes or a verdict of "Complete" arrives.
        if trailing_silence >= STOP_CHUNKS:
            audio_so_far = np.concatenate(self._audio_buffer)
            is_complete = self._smart_turn.is_end_of_turn(audio_so_far)
            if DEBUG:
                label = "Complete" if is_complete else "Incomplete"
                print(f"\n Smart turn: {label}", file=sys.stderr)
            if is_complete:
                return True
            log("smart_turn_incomplete", "Continuing to listen...")

        return False

    def _submit_command(self, command: str) -> None:
        """Bring the typed text in line with ``command`` and press Enter."""
        if command != self._typed_text:
            self._update_typed_text(command)
        log("dispatch_enter", command)
        if DEBUG:
            print(f" Final: {command} [Enter]", file=sys.stderr)
        keyboard.press_enter()

    def _finalize_utterance(self) -> None:
        """End of turn: final STT, correct typed text, press Enter."""
        if not self._audio_buffer:
            self._reset_listening()
            return

        audio = np.concatenate(self._audio_buffer)
        duration = len(audio) / 16000
        log("end_of_turn", f"audio_length={duration:.1f}s")

        if DEBUG:
            print(file=sys.stderr)

        text = self._stt.transcribe(audio)

        if self._state == ACTIVATED:
            # Activated mode — full text is the command
            command = text.strip()
            if command:
                self._submit_command(command)
            else:
                if self._typed_text:
                    keyboard.backspace(len(self._typed_text))
                log("dispatch_cancelled", "Empty command in activated mode")
            self._reset_listening()
            return

        found, command = check_wake_word(text)

        if self._wake_detected:
            # A partial transcript already typed a command: correct and submit.
            if found and command:
                self._submit_command(command)
            elif self._typed_text:
                # Wake word but no command — clear what we typed
                keyboard.backspace(len(self._typed_text))
                log("dispatch_cancelled", "No command after wake word")
        elif found and command:
            # Nothing was typed during partials; type the whole command now.
            log("wake_word_detected", command)
            if DEBUG:
                print(f" Wake word! Final: {command} [Enter]", file=sys.stderr)
            keyboard.type_text(command)
            keyboard.press_enter()
        elif found:
            # Wake word only — enter ACTIVATED state for next utterance
            log("wake_word_only", "Listening for command...")
            if DEBUG:
                print(" Wake word heard — listening for command...", file=sys.stderr)
            self._clear_utterance()
            self._activated_at = time.monotonic()
            self._state = ACTIVATED
            return  # don't reset to IDLE
        else:
            log("no_wake_word", text)
            if DEBUG:
                print(f" No wake word: {text}", file=sys.stderr)

        self._reset_listening()

    def _clear_utterance(self) -> None:
        """Drop buffered audio, reset the VAD and clear incremental-typing state."""
        self._audio_buffer.clear()
        self._vad.reset()
        self._wake_detected = False
        self._typed_text = ""
        self._last_partial_at = 0

    def _reset_listening(self) -> None:
        """Clear all per-utterance state and return to IDLE."""
        self._clear_utterance()
        self._activated_at = 0.0
        self._state = IDLE

    def _expire_activation(self) -> bool:
        """Leave ACTIVATED if no speech arrived within ACTIVATED_TIMEOUT.

        The timeout only applies before any audio has been buffered.

        Returns:
            True when the daemon was reset to IDLE by this call.
        """
        if self._state != ACTIVATED or self._audio_buffer:
            return False
        if time.monotonic() - self._activated_at <= ACTIVATED_TIMEOUT:
            return False
        log("activated_timeout", "No command received")
        if DEBUG:
            print("\n Activation timed out", file=sys.stderr)
        self._reset_listening()
        return True

    def _consume_chunk(
        self,
        chunk: np.ndarray,
        is_speech: bool,
        sample_count: int,
        trailing_silence: int,
    ) -> tuple[int, int]:
        """Buffer one chunk of an in-progress utterance and advance its bookkeeping.

        Runs partial STT every PARTIAL_STT_INTERVAL samples and finalizes the
        utterance when the segment has ended.

        Returns:
            The updated ``(sample_count, trailing_silence)``; both are 0 after
            the utterance has been finalized.
        """
        self._audio_buffer.append(chunk)
        sample_count += len(chunk)
        trailing_silence = 0 if is_speech else trailing_silence + 1

        # Periodic STT for incremental typing
        if sample_count - self._last_partial_at >= PARTIAL_STT_INTERVAL:
            self._last_partial_at = sample_count
            self._run_partial_stt()

        if self._check_segment_end(sample_count, trailing_silence):
            self._finalize_utterance()
            return 0, 0
        return sample_count, trailing_silence

    def run(self) -> None:
        """Install signal handlers and run the capture loop until asked to stop."""
        signal.signal(signal.SIGTERM, self._handle_signal)
        signal.signal(signal.SIGINT, self._handle_signal)

        log("daemon_start", "Starting audio capture...")
        self._mic.start()

        if DEBUG:
            print("Listening... (Ctrl+C to stop)", file=sys.stderr)

        sample_count = 0
        trailing_silence = 0

        try:
            while self._running:
                chunk = self._mic.read(timeout=0.5)
                if chunk is None:
                    # No audio this round; still let a pending activation expire.
                    self._expire_activation()
                    continue

                prob = self._vad.process(chunk)

                if DEBUG:
                    _print_vad_bar(prob, self._vad.threshold, self._state, chunk)

                is_speech = prob > self._vad.threshold

                if self._state == IDLE:
                    if is_speech:
                        log("speech_start")
                        self._state = LISTENING
                        self._audio_buffer.append(chunk)
                        sample_count = len(chunk)
                        trailing_silence = 0
                        self._last_partial_at = 0

                elif self._state == ACTIVATED:
                    # Check timeout (only before speech starts)
                    if self._expire_activation():
                        continue

                    if is_speech and not self._audio_buffer:
                        log("speech_start", "command after activation")

                    # Buffer only once speech has started; after that, silent
                    # chunks are part of the utterance too.
                    if is_speech or self._audio_buffer:
                        sample_count, trailing_silence = self._consume_chunk(
                            chunk, is_speech, sample_count, trailing_silence
                        )

                elif self._state == LISTENING:
                    sample_count, trailing_silence = self._consume_chunk(
                        chunk, is_speech, sample_count, trailing_silence
                    )

        finally:
            if DEBUG:
                print(file=sys.stderr)
            self._mic.stop()
            log("daemon_stop", "Voice daemon stopped")
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def main() -> None:
    """Launch the voice daemon; log any crash and exit with status 1."""
    log("daemon_launch", f"PID={os.getpid()}")
    try:
        VoiceDaemon().run()
    except Exception as exc:
        # Top-level boundary: record the failure before exiting non-zero.
        log("daemon_crash", str(exc), level="error")
        if DEBUG:
            import traceback

            traceback.print_exc(file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Keyword detection in transcribed text."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
from logger import log
|
|
6
|
+
|
|
7
|
+
DEFAULT_WAKE_WORDS = ["hi claude"]


def get_wake_words() -> list[str]:
    """Return the configured wake words, lower-cased.

    ``VOICE_WAKE_WORDS`` is read as a comma-separated list; entries are
    stripped and blank entries are dropped.  When the variable is unset or
    yields no usable entry (for example ``","``), the defaults are used so
    wake-word detection is never silently disabled.

    Returns:
        A new list on every call, so callers may modify it freely without
        affecting ``DEFAULT_WAKE_WORDS``.
    """
    configured = [
        entry.strip().lower()
        for entry in os.environ.get("VOICE_WAKE_WORDS", "").split(",")
        if entry.strip()
    ]
    return configured or list(DEFAULT_WAKE_WORDS)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def check_wake_word(text: str) -> tuple[bool, str]:
    """Check if text contains a wake word. Returns (found, remaining_text).

    Wake words are tried in the order returned by ``get_wake_words()`` and
    the first one that occurs anywhere in ``text`` (ignoring case) wins.

    Args:
        text: Transcribed text to search.

    Returns:
        ``(True, remaining)`` where ``remaining`` is the text after the wake
        word with surrounding whitespace and leading commas removed, or
        ``(False, text)`` when no wake word occurs.
    """
    import re  # function-scope import: only this function needs it

    for word in get_wake_words():
        # Match case-insensitively on the original text.  Searching a
        # lower-cased copy and reusing its index on ``text`` is wrong whenever
        # lower-casing changes the string length (e.g. "İ" becomes two code
        # points), which would cut the remainder at the wrong offset.
        match = re.search(re.escape(word), text, re.IGNORECASE)
        if match:
            remaining = text[match.end() :].strip().lstrip(",").strip()
            log("wake_word_detected", word, remaining=remaining)
            return True, remaining
    return False, text
|