@staff0rd/assist 0.78.0 → 0.80.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,50 @@
1
+ """Silero VAD wrapper (ONNX)."""
2
+
3
+ import os
4
+
5
+ import numpy as np
6
+ import onnxruntime as ort
7
+
8
+ from logger import log
9
+
10
# Speech-probability cutoff: chunks scoring above this are treated as speech.
DEFAULT_THRESHOLD = 0.5
CONTEXT_SIZE = 64  # v5/v6 requires 64 context samples prepended at 16kHz
12
+
13
+
14
class SileroVAD:
    """Streaming wrapper around the Silero VAD ONNX model.

    Holds the model's recurrent state and the trailing 64-sample context
    between calls so that consecutive chunks can be scored independently.
    """

    def __init__(self, threshold: float | None = None):
        """Load the ONNX model and initialize streaming state.

        The model path comes from VOICE_MODEL_VAD, falling back to
        ``silero_vad.onnx`` inside VOICE_MODELS_DIR (default
        ``~/.assist/voice/models``).

        Args:
            threshold: speech-probability cutoff exposed to callers via
                ``self.threshold``; defaults to DEFAULT_THRESHOLD.
        """
        model_path = os.environ.get("VOICE_MODEL_VAD")
        if not model_path:
            models_dir = os.environ.get(
                "VOICE_MODELS_DIR",
                os.path.expanduser("~/.assist/voice/models"),
            )
            model_path = os.path.join(models_dir, "silero_vad.onnx")

        log("vad_init", f"model={model_path}")
        self._session = ort.InferenceSession(
            model_path, providers=["CPUExecutionProvider"]
        )
        # Fixed 16 kHz rate: the 64-sample context size only holds at 16 kHz.
        self._sample_rate = np.array(16000, dtype=np.int64)
        self.threshold = DEFAULT_THRESHOLD if threshold is None else threshold
        # Single source of truth for the zeroed streaming state (was
        # previously duplicated between __init__ and reset()).
        self.reset()

    def process(self, audio: np.ndarray) -> float:
        """Process a chunk of audio, return speech probability in [0, 1]."""
        chunk = audio.astype(np.float32)
        # Prepend context (last 64 samples from previous chunk), as the
        # v5/v6 models expect.
        input_data = np.concatenate([self._context, chunk]).reshape(1, -1)
        ort_inputs = {
            "input": input_data,
            "state": self._state,
            "sr": self._sample_rate,
        }
        out, state = self._session.run(None, ort_inputs)
        self._state = state
        self._context = chunk[-CONTEXT_SIZE:]
        return float(out[0][0])

    def reset(self) -> None:
        """Clear recurrent state and audio context (e.g. between utterances)."""
        self._state = np.zeros((2, 1, 128), dtype=np.float32)
        self._context = np.zeros(CONTEXT_SIZE, dtype=np.float32)
@@ -0,0 +1,362 @@
1
+ """Voice daemon entry point — main loop and signal handling."""
2
+
3
+ import os
4
+ import signal
5
+ import sys
6
+ import time
7
+
8
+ import numpy as np
9
+
10
+ from audio_capture import AudioCapture, BLOCK_SIZE
11
+ from logger import DEBUG, log
12
+ from smart_turn import SmartTurn
13
+ from stt import ParakeetSTT
14
+ from vad import SileroVAD
15
+ from wake_word import check_wake_word
16
+
17
+ import keyboard
18
+
19
# States of the daemon's turn-taking state machine.
IDLE = "idle"
LISTENING = "listening"
ACTIVATED = "activated"  # wake word heard (alone), waiting for command utterance

# Max seconds of speech before forced processing
MAX_SPEECH_SECONDS = 30

# How often (in samples) to run partial STT during speech
PARTIAL_STT_INTERVAL = 16000  # every 1 second of audio at 16 kHz

# Trailing silence (in ms) required before sending segment to smart turn.
# Matches the reference implementation (record_and_predict.py STOP_MS=1000).
STOP_MS = 1000
# Converted to a count of BLOCK_SIZE-sample chunks at 16 kHz.
STOP_CHUNKS = (STOP_MS * 16000) // (BLOCK_SIZE * 1000)  # ~31 chunks

# How long (seconds) to wait for a command after a wake-word-only utterance
ACTIVATED_TIMEOUT = 10.0
37
+
38
+
39
+ def _print_vad_bar(
40
+ prob: float, threshold: float, state: str, chunk: np.ndarray
41
+ ) -> None:
42
+ """Print a live VAD meter to stderr when debug mode is on."""
43
+ rms = float(np.sqrt(np.mean(chunk**2)))
44
+ peak = float(np.max(np.abs(chunk)))
45
+ width = 40
46
+ filled = int(prob * width)
47
+ bar = "█" * filled + "░" * (width - filled)
48
+ marker = ">" if prob > threshold else " "
49
+ print(
50
+ f"\r {marker} VAD {prob:.2f} [{bar}] "
51
+ f"rms={rms:.4f} peak={peak:.4f} {state:10s}",
52
+ end="",
53
+ file=sys.stderr,
54
+ flush=True,
55
+ )
56
+
57
+
58
class VoiceDaemon:
    """Turn-taking state machine driven by VAD, STT, and smart-turn models.

    States: IDLE (waiting for speech), LISTENING (buffering an utterance),
    ACTIVATED (wake word already heard alone; next utterance is the command).
    Transcribed commands are typed via the keyboard module and submitted
    with Enter.
    """

    def __init__(self):
        """Load all models and initialize buffering/typing state."""
        self._running = True
        self._state = IDLE
        self._audio_buffer: list[np.ndarray] = []

        log("daemon_init", "Initializing models...")
        self._mic = AudioCapture()
        self._vad = SileroVAD()
        self._smart_turn = SmartTurn()
        self._stt = ParakeetSTT()
        log("daemon_ready")

        # Incremental typing state
        self._wake_detected = False   # wake word seen in a partial transcript
        self._typed_text = ""         # what has been typed so far this turn
        self._last_partial_at = 0     # sample count at last partial STT run
        self._activated_at = 0.0      # monotonic time ACTIVATED was entered

    def _handle_signal(self, signum, frame) -> None:
        """SIGTERM/SIGINT handler: ask the main loop to exit cleanly."""
        log("daemon_signal", f"Received signal {signum}")
        self._running = False

    def _run_partial_stt(self) -> None:
        """Run STT on accumulated audio and type incrementally."""
        if not self._audio_buffer:
            return

        audio = np.concatenate(self._audio_buffer)
        text = self._stt.transcribe(audio)
        if not text.strip():
            return

        if DEBUG:
            print(f"\n Partial: {text}", file=sys.stderr)

        if self._state == ACTIVATED:
            # Already activated — everything is the command, no wake word needed
            if text.strip() != self._typed_text:
                if self._typed_text:
                    self._update_typed_text(text.strip())
                else:
                    keyboard.type_text(text.strip())
                self._typed_text = text.strip()
        elif not self._wake_detected:
            # Still scanning partials for the wake word.
            found, command = check_wake_word(text)
            if found and command:
                self._wake_detected = True
                log("wake_word_detected", command)
                if DEBUG:
                    print(f" Wake word! Typing: {command}", file=sys.stderr)
                keyboard.type_text(command)
                self._typed_text = command
        else:
            # Wake word already seen — keep the typed command in sync with
            # the latest partial transcription.
            found, command = check_wake_word(text)
            if found and command and command != self._typed_text:
                self._update_typed_text(command)

    def _update_typed_text(self, new_text: str) -> None:
        """Diff old typed text vs new, backspace + type the difference."""
        old = self._typed_text
        # Find common prefix
        common = 0
        for a, b in zip(old, new_text):
            if a == b:
                common += 1
            else:
                break
        # Backspace what's wrong, type new suffix
        to_delete = len(old) - common
        to_type = new_text[common:]
        if to_delete > 0:
            keyboard.backspace(to_delete)
        if to_type:
            keyboard.type_text(to_type)
        self._typed_text = new_text

    def _check_segment_end(self, sample_count: int, trailing_silence: int) -> bool:
        """Check if the current segment is done.

        Follows the reference smart-turn implementation:
        1. Accumulate speech + trailing silence.
        2. After STOP_MS of continuous silence, send the full segment to smart turn.
        3. If smart turn says "Incomplete", keep listening (return False).
        4. If smart turn says "Complete", finalize (return True).
        5. Hard cap at MAX_SPEECH_SECONDS always finalizes.

        Args:
            sample_count: total samples buffered this segment.
            trailing_silence: consecutive sub-threshold chunks seen so far.
        """
        max_samples = MAX_SPEECH_SECONDS * 16000

        if sample_count >= max_samples:
            log("max_speech", "Reached max speech duration")
            return True

        if trailing_silence >= STOP_CHUNKS:
            audio_so_far = np.concatenate(self._audio_buffer)
            is_complete = self._smart_turn.is_end_of_turn(audio_so_far)
            if DEBUG:
                label = "Complete" if is_complete else "Incomplete"
                print(f"\n Smart turn: {label}", file=sys.stderr)
            if is_complete:
                return True
            else:
                log("smart_turn_incomplete", "Continuing to listen...")
        return False

    def _finalize_utterance(self) -> None:
        """End of turn: final STT, correct typed text, press Enter."""
        if not self._audio_buffer:
            self._reset_listening()
            return

        audio = np.concatenate(self._audio_buffer)
        duration = len(audio) / 16000
        log("end_of_turn", f"audio_length={duration:.1f}s")

        if DEBUG:
            # End the in-place VAD meter line before printing results.
            print(file=sys.stderr)

        text = self._stt.transcribe(audio)

        if self._state == ACTIVATED:
            # Activated mode — full text is the command
            command = text.strip()
            if command:
                if command != self._typed_text:
                    if self._typed_text:
                        self._update_typed_text(command)
                    else:
                        keyboard.type_text(command)
                log("dispatch_enter", command)
                if DEBUG:
                    print(f" Final: {command} [Enter]", file=sys.stderr)
                keyboard.press_enter()
            else:
                # Nothing transcribed — undo anything the partials typed.
                if self._typed_text:
                    keyboard.backspace(len(self._typed_text))
                log("dispatch_cancelled", "Empty command in activated mode")
            self._reset_listening()
            return

        if self._wake_detected:
            # Correct final text and submit
            found, command = check_wake_word(text)
            if found and command:
                if command != self._typed_text:
                    self._update_typed_text(command)
                log("dispatch_enter", command)
                if DEBUG:
                    print(f" Final: {command} [Enter]", file=sys.stderr)
                keyboard.press_enter()
            elif self._typed_text:
                # Wake word but no command — clear what we typed
                keyboard.backspace(len(self._typed_text))
                log("dispatch_cancelled", "No command after wake word")
        else:
            # Check final transcription for wake word
            found, command = check_wake_word(text)
            if found and command:
                log("wake_word_detected", command)
                if DEBUG:
                    print(f" Wake word! Final: {command} [Enter]", file=sys.stderr)
                keyboard.type_text(command)
                keyboard.press_enter()
            elif found:
                # Wake word only — enter ACTIVATED state for next utterance
                log("wake_word_only", "Listening for command...")
                if DEBUG:
                    print(" Wake word heard — listening for command...", file=sys.stderr)
                self._audio_buffer.clear()
                self._vad.reset()
                self._wake_detected = False
                self._typed_text = ""
                self._last_partial_at = 0
                self._activated_at = time.monotonic()
                self._state = ACTIVATED
                return  # don't reset to IDLE
            else:
                log("no_wake_word", text)
                if DEBUG:
                    print(f" No wake word: {text}", file=sys.stderr)

        self._reset_listening()

    def _reset_listening(self) -> None:
        """Drop all per-turn state and return to IDLE."""
        self._audio_buffer.clear()
        self._vad.reset()
        self._wake_detected = False
        self._typed_text = ""
        self._last_partial_at = 0
        self._activated_at = 0.0
        self._state = IDLE

    def run(self) -> None:
        """Main loop: read mic chunks and drive the state machine until stopped."""
        signal.signal(signal.SIGTERM, self._handle_signal)
        signal.signal(signal.SIGINT, self._handle_signal)

        log("daemon_start", "Starting audio capture...")
        self._mic.start()

        if DEBUG:
            print("Listening... (Ctrl+C to stop)", file=sys.stderr)

        sample_count = 0        # samples buffered in the current segment
        trailing_silence = 0    # consecutive sub-threshold chunks

        try:
            while self._running:
                chunk = self._mic.read(timeout=0.5)
                if chunk is None:
                    # Mic read timed out — still enforce the activation
                    # timeout so ACTIVATED can't hang on a silent mic.
                    if self._state == ACTIVATED and not self._audio_buffer:
                        if time.monotonic() - self._activated_at > ACTIVATED_TIMEOUT:
                            log("activated_timeout", "No command received")
                            if DEBUG:
                                print("\n Activation timed out", file=sys.stderr)
                            self._reset_listening()
                    continue

                prob = self._vad.process(chunk)

                if DEBUG:
                    _print_vad_bar(prob, self._vad.threshold, self._state, chunk)

                if self._state == IDLE:
                    if prob > self._vad.threshold:
                        log("speech_start")
                        self._state = LISTENING
                        self._audio_buffer.append(chunk)
                        sample_count = len(chunk)
                        trailing_silence = 0
                        self._last_partial_at = 0

                elif self._state == ACTIVATED:
                    # Check timeout (only before speech starts)
                    if not self._audio_buffer:
                        if time.monotonic() - self._activated_at > ACTIVATED_TIMEOUT:
                            log("activated_timeout", "No command received")
                            if DEBUG:
                                print("\n Activation timed out", file=sys.stderr)
                            self._reset_listening()
                            continue

                    if prob > self._vad.threshold and not self._audio_buffer:
                        log("speech_start", "command after activation")

                    if prob > self._vad.threshold or self._audio_buffer:
                        self._audio_buffer.append(chunk)
                        sample_count += len(chunk)

                        if prob > self._vad.threshold:
                            trailing_silence = 0
                        else:
                            trailing_silence += 1

                        # Periodic STT for incremental typing
                        if sample_count - self._last_partial_at >= PARTIAL_STT_INTERVAL:
                            self._last_partial_at = sample_count
                            self._run_partial_stt()

                        if self._check_segment_end(sample_count, trailing_silence):
                            self._finalize_utterance()
                            sample_count = 0
                            trailing_silence = 0

                elif self._state == LISTENING:
                    self._audio_buffer.append(chunk)
                    sample_count += len(chunk)

                    if prob > self._vad.threshold:
                        trailing_silence = 0
                    else:
                        trailing_silence += 1

                    # Periodic STT for incremental typing
                    if sample_count - self._last_partial_at >= PARTIAL_STT_INTERVAL:
                        self._last_partial_at = sample_count
                        self._run_partial_stt()

                    if self._check_segment_end(sample_count, trailing_silence):
                        self._finalize_utterance()
                        sample_count = 0
                        trailing_silence = 0

        finally:
            if DEBUG:
                print(file=sys.stderr)
            self._mic.stop()
            log("daemon_stop", "Voice daemon stopped")
345
+
346
+
347
def main() -> None:
    """Launch the voice daemon; on crash, log the error and exit non-zero."""
    log("daemon_launch", f"PID={os.getpid()}")
    try:
        VoiceDaemon().run()
    except Exception as err:
        log("daemon_crash", str(err), level="error")
        if DEBUG:
            import traceback

            traceback.print_exc(file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,26 @@
1
+ """Keyword detection in transcribed text."""
2
+
3
import os
import re

from logger import log
6
+
7
DEFAULT_WAKE_WORDS = ["hi claude"]  # fallback when VOICE_WAKE_WORDS is not configured
8
+
9
+
10
def get_wake_words() -> list[str]:
    """Return the active wake words, lowercased.

    Parses the comma-separated VOICE_WAKE_WORDS environment variable.
    Falls back to DEFAULT_WAKE_WORDS when the variable is unset OR parses
    to no usable entries (e.g. only commas/whitespace) — previously a
    blank-only value returned [] and silently disabled wake-word detection.
    """
    env = os.environ.get("VOICE_WAKE_WORDS", "")
    words = [w.strip().lower() for w in env.split(",") if w.strip()]
    return words or DEFAULT_WAKE_WORDS
15
+
16
+
17
def check_wake_word(text: str) -> tuple[bool, str]:
    """Check if text contains a wake word. Returns (found, remaining_text).

    Matching is case-insensitive and anchored on word boundaries, so a wake
    word embedded inside another word (e.g. "sushi claude" vs "hi claude")
    no longer triggers a false positive — the previous plain substring
    ``find`` did. On a match, the text after the wake word is returned with
    surrounding whitespace and a leading comma stripped; otherwise the
    original text is returned unchanged with found=False.
    """
    lower = text.lower()
    for word in get_wake_words():
        # \b anchors prevent mid-word matches; escape in case a configured
        # wake word contains regex metacharacters.
        match = re.search(r"\b" + re.escape(word) + r"\b", lower)
        if match:
            remaining = text[match.end() :].strip().lstrip(",").strip()
            log("wake_word_detected", word, remaining=remaining)
            return True, remaining
    return False, text