@staff0rd/assist 0.79.0 → 0.81.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -43,6 +43,11 @@ After installation, the `assist` command will be available globally.
43
43
  - `/verify` - Run all verification commands in parallel
44
44
  - `/transcript-format` - Format meeting transcripts from VTT files
45
45
  - `/transcript-summarise` - Summarise transcripts missing summaries
46
+ - `/voice-setup` - Download required voice models (VAD, STT)
47
+ - `/voice-start` - Start the voice interaction daemon
48
+ - `/voice-stop` - Stop the voice interaction daemon
49
+ - `/voice-status` - Check voice daemon status
50
+ - `/voice-logs` - Show recent voice daemon logs
46
51
 
47
52
  ## CLI Commands
48
53
 
@@ -96,4 +101,11 @@ After installation, the `assist` command will be available globally.
96
101
  - `assist transcript configure` - Configure transcript directories
97
102
  - `assist transcript format` - Convert VTT files to formatted markdown transcripts
98
103
  - `assist transcript summarise` - List transcripts that do not have summaries
104
+ - `assist voice setup` - Download required voice models (VAD, STT)
105
+ - `assist voice start` - Start the voice daemon (always-on, listens for wake word)
106
+ - `assist voice start --foreground` - Start in foreground for debugging
107
+ - `assist voice stop` - Stop the voice daemon
108
+ - `assist voice status` - Check voice daemon status and recent events
109
+ - `assist voice devices` - List available audio input devices
110
+ - `assist voice logs [-n <count>]` - Show recent voice daemon log entries
99
111
 
@@ -0,0 +1,5 @@
1
+ ---
2
+ description: Show recent voice daemon logs
3
+ ---
4
+
5
+ Run `assist voice logs 2>&1` to show recent voice daemon logs. Show the result to the user.
@@ -0,0 +1,5 @@
1
+ ---
2
+ description: Download required voice models (VAD, STT)
3
+ ---
4
+
5
+ Run `assist voice setup 2>&1` to download the required voice models. Show the result to the user.
@@ -0,0 +1,5 @@
1
+ ---
2
+ description: Start the voice interaction daemon
3
+ ---
4
+
5
+ Run `assist voice start 2>&1` to start the voice daemon. If it fails, show the error to the user.
@@ -0,0 +1,5 @@
1
+ ---
2
+ description: Check voice daemon status
3
+ ---
4
+
5
+ Run `assist voice status 2>&1` to check the voice daemon status. Show the result to the user.
@@ -0,0 +1,5 @@
1
+ ---
2
+ description: Stop the voice interaction daemon
3
+ ---
4
+
5
+ Run `assist voice stop 2>&1` to stop the voice daemon. Show the result to the user.
@@ -27,6 +27,7 @@
27
27
  "Bash(assist transcript summarise:*)",
28
28
  "Bash(assist complexity:*)",
29
29
  "Bash(assist transcript format:*)",
30
+ "Bash(assist voice:*)",
30
31
  "Bash(date:*)",
31
32
  "Bash(git add:*)",
32
33
  "Bash(git status:*)",
@@ -48,6 +49,11 @@
48
49
  "SlashCommand(/review-comments)",
49
50
  "SlashCommand(/transcript-format)",
50
51
  "SlashCommand(/transcript-summarise)",
52
+ "SlashCommand(/voice-setup)",
53
+ "SlashCommand(/voice-start)",
54
+ "SlashCommand(/voice-stop)",
55
+ "SlashCommand(/voice-status)",
56
+ "SlashCommand(/voice-logs)",
51
57
  "SlashCommand(/journal)",
52
58
  "SlashCommand(/standup)",
53
59
  "Skill(next-backlog-item)",
@@ -60,6 +66,11 @@
60
66
  "Skill(transcript-summarise)",
61
67
  "Skill(journal)",
62
68
  "Skill(standup)",
69
+ "Skill(voice-setup)",
70
+ "Skill(voice-start)",
71
+ "Skill(voice-stop)",
72
+ "Skill(voice-status)",
73
+ "Skill(voice-logs)",
63
74
  "WebFetch(domain:staffordwilliams.com)"
64
75
  ],
65
76
  "deny": ["Bash(git commit:*)", "Bash(npm run:*)", "Bash(npx assist:*)"]
@@ -0,0 +1,49 @@
1
+ """Microphone capture via sounddevice (16kHz PCM)."""
2
+
3
+ import os
4
+ import queue
5
+ import numpy as np
6
+ import sounddevice as sd
7
+
8
+ from logger import log
9
+
10
+ SAMPLE_RATE = 16000
11
+ BLOCK_SIZE = 512 # Silero VAD requires exactly 512 samples at 16kHz
12
+
13
+
14
+ class AudioCapture:
15
+ def __init__(self):
16
+ self._queue: queue.Queue[np.ndarray] = queue.Queue()
17
+ self._stream: sd.InputStream | None = None
18
+ device_name = os.environ.get("VOICE_MIC")
19
+ self._device = device_name if device_name else None
20
+
21
+ def _callback(self, indata: np.ndarray, frames: int, time_info, status) -> None:
22
+ if status:
23
+ log("audio_status", str(status), level="warn")
24
+ self._queue.put(indata[:, 0].copy())
25
+
26
+ def start(self) -> None:
27
+ log("audio_start", f"device={self._device}, rate={SAMPLE_RATE}, block={BLOCK_SIZE}")
28
+ self._stream = sd.InputStream(
29
+ samplerate=SAMPLE_RATE,
30
+ channels=1,
31
+ dtype="float32",
32
+ blocksize=BLOCK_SIZE,
33
+ device=self._device,
34
+ callback=self._callback,
35
+ )
36
+ self._stream.start()
37
+
38
+ def read(self, timeout: float = 1.0) -> np.ndarray | None:
39
+ try:
40
+ return self._queue.get(timeout=timeout)
41
+ except queue.Empty:
42
+ return None
43
+
44
+ def stop(self) -> None:
45
+ if self._stream:
46
+ self._stream.stop()
47
+ self._stream.close()
48
+ self._stream = None
49
+ log("audio_stop")
@@ -0,0 +1,14 @@
1
+ """Dispatch — keyboard-based input into the active terminal."""
2
+
3
+ from logger import log
4
+
5
+ import keyboard
6
+
7
+
8
def dispatch(command: str) -> str:
    """Send *command* as simulated keystrokes followed by Enter.

    A ``dispatch_start`` event is logged before typing and ``dispatch_done``
    once Enter has been pressed. The command string is returned unchanged.
    """
    log("dispatch_start", command)

    # Type the text first, then submit it.
    keyboard.type_text(command)
    keyboard.press_enter()

    log("dispatch_done", command)
    return command
@@ -0,0 +1,73 @@
1
+ """Simulate keyboard input on Windows via SendInput."""
2
+
3
+ import ctypes
4
+ import ctypes.wintypes as w
5
+
6
# Handle to user32.dll. ``ctypes.windll`` only exists on Windows, so importing
# this module on any other platform fails on this line.
user32 = ctypes.windll.user32

# Value of INPUT.type that selects the keyboard member of the union.
INPUT_KEYBOARD = 1
# Bits for KEYBDINPUT.dwFlags.
KEYEVENTF_UNICODE = 0x0004  # wScan carries a UTF-16 code unit, not a scan code
KEYEVENTF_KEYUP = 0x0002  # key release; without it the event is a key press
KEYEVENTF_SCANCODE = 0x0008  # not referenced by the code visible in this module
# Virtual-key codes and the scan codes sent alongside them.
VK_RETURN = 0x0D
VK_BACK = 0x08
SCAN_RETURN = 0x1C
SCAN_BACK = 0x0E
16
+
17
+
18
class KEYBDINPUT(ctypes.Structure):
    """ctypes mirror of the Win32 ``KEYBDINPUT`` struct (one keyboard event)."""

    # Field order and types define the binary layout handed to SendInput.
    _fields_ = [
        ("wVk", w.WORD),
        ("wScan", w.WORD),
        ("dwFlags", w.DWORD),
        ("time", w.DWORD),
        # Declared as a pointer so the field is pointer-sized; this module
        # never assigns it, so it stays NULL.
        ("dwExtraInfo", ctypes.POINTER(ctypes.c_ulong)),
    ]
26
+
27
+
28
class MOUSEINPUT(ctypes.Structure):
    """ctypes mirror of the Win32 ``MOUSEINPUT`` struct.

    The code visible here never fills it in; it is a member of the INPUT
    union (presumably so the union has the size SendInput expects).
    """

    _fields_ = [
        ("dx", ctypes.c_long),
        ("dy", ctypes.c_long),
        ("mouseData", w.DWORD),
        ("dwFlags", w.DWORD),
        ("time", w.DWORD),
        # Pointer-sized slot, same declaration as in KEYBDINPUT.
        ("dwExtraInfo", ctypes.POINTER(ctypes.c_ulong)),
    ]
37
+
38
+
39
class INPUT(ctypes.Structure):
    """ctypes mirror of the Win32 ``INPUT`` struct: a type tag plus a union."""

    class _INPUT(ctypes.Union):
        # Mouse ("mi") and keyboard ("ki") payloads share the same storage.
        _fields_ = [("mi", MOUSEINPUT), ("ki", KEYBDINPUT)]

    # Making the union anonymous exposes its members directly, e.g. ``inp.ki``.
    _anonymous_ = ("_input",)
    _fields_ = [("type", w.DWORD), ("_input", _INPUT)]
45
+
46
+
47
def _send_key(vk: int = 0, scan: int = 0, flags: int = 0) -> None:
    """Inject one keyboard event via ``SendInput``.

    *vk* is the virtual-key code, *scan* the scan code (or a UTF-16 code unit
    when ``KEYEVENTF_UNICODE`` is set) and *flags* the ``KEYEVENTF_*`` bits.
    The return value of ``SendInput`` is not inspected.
    """
    event = INPUT(type=INPUT_KEYBOARD)
    # ``event.ki`` is a view onto the union storage inside ``event``.
    key_info = event.ki
    key_info.wVk, key_info.wScan, key_info.dwFlags = vk, scan, flags
    user32.SendInput(1, ctypes.byref(event), ctypes.sizeof(event))
53
+
54
+
55
def type_text(text: str) -> None:
    """Type *text* by sending Unicode keystrokes.

    Each UTF-16 code unit is sent as a key-down followed by a key-up with
    ``KEYEVENTF_UNICODE``. Characters outside the Basic Multilingual Plane
    (emoji, for example) are sent as their surrogate pair, one unit after the
    other.
    """
    # KEYBDINPUT.wScan is a 16-bit WORD, so a raw code point above 0xFFFF
    # would be truncated into a different character. Encoding to UTF-16 splits
    # such characters into two 16-bit units; "surrogatepass" keeps lone
    # surrogates already present in the string instead of raising.
    encoded = text.encode("utf-16-le", "surrogatepass")
    for offset in range(0, len(encoded), 2):
        unit = int.from_bytes(encoded[offset:offset + 2], "little")
        _send_key(scan=unit, flags=KEYEVENTF_UNICODE)
        _send_key(scan=unit, flags=KEYEVENTF_UNICODE | KEYEVENTF_KEYUP)
61
+
62
+
63
def backspace(n: int = 1) -> None:
    """Tap the Backspace key *n* times (press then release each time)."""
    press_then_release = (0, KEYEVENTF_KEYUP)
    for _ in range(n):
        for flags in press_then_release:
            _send_key(vk=VK_BACK, scan=SCAN_BACK, flags=flags)
68
+
69
+
70
def press_enter() -> None:
    """Tap the Enter key once: a key-down event followed by a key-up event."""
    for flags in (0, KEYEVENTF_KEYUP):
        _send_key(vk=VK_RETURN, scan=SCAN_RETURN, flags=flags)
@@ -0,0 +1,20 @@
1
+ """List available audio input devices."""
2
+
3
+ import sounddevice as sd
4
+
5
+
6
def main() -> None:
    """Print every device that has input channels, marking the default input."""
    devices = sd.query_devices()
    print("Audio input devices:\n")
    for index, info in enumerate(devices):
        # Skip devices that expose no input channels.
        if not info["max_input_channels"] > 0:
            continue
        marker = ""
        if index == sd.default.device[0]:
            marker = " (default)"
        print(f" [{index}] {info['name']}{marker}")
        channels = info["max_input_channels"]
        sample_rate = info["default_samplerate"]
        print(f" channels={channels}, rate={sample_rate}")
    print()
17
+
18
+
19
+ if __name__ == "__main__":
20
+ main()
@@ -0,0 +1,38 @@
1
+ """JSON Lines structured logging to voice.log."""
2
+
3
+ import json
4
+ import os
5
+ import sys
6
+ from datetime import datetime, timezone
7
+
8
+
9
# Destination of the JSON Lines log; override with VOICE_LOG_FILE.
LOG_FILE = os.environ.get(
    "VOICE_LOG_FILE", os.path.expanduser("~/.assist/voice/voice.log")
)

# When VOICE_DEBUG=1 every entry is also echoed to stderr in a short form.
DEBUG = os.environ.get("VOICE_DEBUG", "") == "1"


def _write(entry: dict) -> None:
    """Timestamp *entry* and append it to ``LOG_FILE`` as one JSON line.

    The entry is mutated in place: a UTC ISO-8601 ``timestamp`` key is added.
    Writing is best effort — an ``OSError`` while creating the directory or
    appending to the file is ignored so logging never stops the caller.
    When ``DEBUG`` is on, a one-line summary is also printed to stderr.
    """
    entry["timestamp"] = datetime.now(timezone.utc).isoformat()
    # default=str: values that JSON cannot encode are stringified rather than
    # raising out of the logger.
    line = json.dumps(entry, default=str)
    try:
        # Create the log directory on demand; without it every write would
        # fail and be silently dropped by the handler below.
        log_dir = os.path.dirname(LOG_FILE)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except OSError:
        # Deliberate: an unwritable log must not break the caller.
        pass
    if DEBUG:
        ts = entry["timestamp"][11:19]  # HH:MM:SS slice of the ISO timestamp
        level = entry.get("level", "info").upper()
        event = entry.get("event", "")
        msg = entry.get("message", "")
        print(f"{ts} {level:5s} [{event}] {msg}", file=sys.stderr, flush=True)
30
+
31
+
32
def log(event: str, message: str = "", *, level: str = "info", **data) -> None:
    """Build a structured log entry and pass it to ``_write``.

    ``event`` and ``level`` are always present. ``message`` and ``data`` (the
    extra keyword arguments) are included only when non-empty.
    """
    entry: dict = {"event": event, "level": level}
    optional_fields = (("message", message), ("data", data))
    entry.update({key: value for key, value in optional_fields if value})
    _write(entry)
@@ -0,0 +1,34 @@
1
+ [project]
2
+ name = "assist-voice"
3
+ version = "0.1.0"
4
+ requires-python = ">=3.10"
5
+ dependencies = [
6
+ "onnxruntime>=1.17",
7
+ "sounddevice>=0.4",
8
+ "numpy>=1.24",
9
+ "nemo_toolkit[asr]>=1.22",
10
+ "silero-vad>=5.1",
11
+ ]
12
+
13
+ [project.optional-dependencies]
14
+ dev = [
15
+ "ruff>=0.8",
16
+ ]
17
+
18
[tool.setuptools]
# Every top-level module that must ship with the package. "keyboard" is the
# local SendInput helper that dispatch.py imports; leaving it out means an
# installed copy of "dispatch" cannot be imported.
py-modules = [
    "audio_capture",
    "dispatch",
    "keyboard",
    "logger",
    "smart_turn",
    "stt",
    "vad",
    "voice_daemon",
    "wake_word",
    "setup_models",
    "list_devices",
]
31
+
32
+ [build-system]
33
+ requires = ["setuptools>=68"]
34
+ build-backend = "setuptools.build_meta"
@@ -0,0 +1,91 @@
1
+ """Download and prepare all required voice models."""
2
+
3
+ import os
4
+ import sys
5
+
6
+ from logger import log
7
+
8
+
9
def get_models_dir() -> str:
    """Return the model directory: ``VOICE_MODELS_DIR`` if set, else the default."""
    override = os.environ.get("VOICE_MODELS_DIR")
    if override is not None:
        # Returned as-is, even when it is an empty string.
        return override
    return os.path.expanduser("~/.assist/voice/models")
14
+
15
+
16
def setup_silero_vad(models_dir: str) -> None:
    """Download ``silero_vad.onnx`` into *models_dir* unless it already exists.

    The file is fetched to a temporary ``.part`` path and moved into place
    only after the download finished. An interrupted download therefore never
    leaves a truncated model that later runs would report as "already exists".

    Raises whatever ``urllib.request.urlretrieve`` or the final rename raises;
    the partial file is removed in that case.
    """
    target = os.path.join(models_dir, "silero_vad.onnx")
    if os.path.exists(target):
        print(" silero_vad.onnx already exists")
        return

    print(" Downloading Silero VAD ONNX model...")
    import urllib.request

    url = "https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx"
    partial = target + ".part"
    try:
        urllib.request.urlretrieve(url, partial)
        # Rename within the same directory so the finished file appears at
        # the target path in one step.
        os.replace(partial, target)
    finally:
        # After a successful rename the partial path no longer exists; after
        # a failure this cleans up whatever was written so far.
        if os.path.exists(partial):
            os.remove(partial)
    log("setup_vad", f"Downloaded to {target}")
    print(" silero_vad.onnx downloaded")
29
+
30
+
31
def setup_smart_turn(models_dir: str) -> None:
    """Fetch the Smart Turn ONNX model into *models_dir* if it is not there yet.

    The file comes from the ``pipecat-ai/smart-turn-v3`` HuggingFace repo via
    ``hf_hub_download``; the resulting path is logged as ``setup_smart_turn``.
    """
    filename = "smart-turn-v3.2-cpu.onnx"
    already_there = os.path.exists(os.path.join(models_dir, filename))
    if already_there:
        print(f" {filename} already exists")
        return

    print(" Downloading Smart Turn ONNX model from HuggingFace...")
    # Imported here so the module loads without huggingface_hub installed.
    from huggingface_hub import hf_hub_download

    downloaded_to = hf_hub_download(
        repo_id="pipecat-ai/smart-turn-v3",
        filename=filename,
        local_dir=models_dir,
    )
    log("setup_smart_turn", f"Downloaded to {downloaded_to}")
    print(f" {filename} downloaded")
47
+
48
+
49
def setup_stt(models_dir: str) -> None:
    """Load the STT model once via NeMo's ``from_pretrained``.

    The model name comes from ``VOICE_MODEL_STT`` (default
    ``nvidia/parakeet-ctc-1.1b``). *models_dir* is accepted but not used by
    this function.
    """
    stt_model = os.environ.get("VOICE_MODEL_STT", "nvidia/parakeet-ctc-1.1b")
    print(f" Downloading STT model: {stt_model}...")
    print(" (this may take a while on first run)")

    # Imported here so the module loads without NeMo installed.
    import nemo.collections.asr as nemo_asr

    model_class = nemo_asr.models.EncDecCTCModelBPE
    model_class.from_pretrained(stt_model)
    log("setup_stt", f"Model ready: {stt_model}")
    print(f" STT model ready: {stt_model}")
59
+
60
+
61
def main() -> None:
    """Run every model setup step and report the overall outcome.

    Each step runs even if an earlier one failed, so one bad download does
    not block the others. Failures are logged and printed to stderr; when any
    step failed, the final message says so and the process exits with status
    1 instead of claiming success.
    """
    models_dir = get_models_dir()
    os.makedirs(models_dir, exist_ok=True)
    print(f"Models directory: {models_dir}\n")

    # (title shown to the user, log event used on failure, step function)
    steps = (
        ("Silero VAD", "setup_vad_error", setup_silero_vad),
        ("Smart Turn (pipecat-ai)", "setup_smart_turn_error", setup_smart_turn),
        ("Parakeet STT (NeMo)", "setup_stt_error", setup_stt),
    )
    failed: list[str] = []
    for number, (title, error_event, run_step) in enumerate(steps, start=1):
        separator = "" if number == 1 else "\n"
        print(f"{separator}[{number}/{len(steps)}] {title}")
        try:
            run_step(models_dir)
        except Exception as e:
            # Top-level boundary: record the failure and carry on.
            log(error_event, str(e), level="error")
            print(f" ERROR: {e}", file=sys.stderr)
            failed.append(title)

    if failed:
        print(f"\nSetup finished with errors: {', '.join(failed)}", file=sys.stderr)
        sys.exit(1)
    print("\nSetup complete.")
88
+
89
+
90
+ if __name__ == "__main__":
91
+ main()
@@ -0,0 +1,63 @@
1
+ """Smart Turn end-of-utterance detection (ONNX) via pipecat-ai/smart-turn."""
2
+
3
+ import os
4
+
5
+ import numpy as np
6
+ import onnxruntime as ort
7
+ from transformers import WhisperFeatureExtractor
8
+
9
+ from logger import log
10
+
11
END_THRESHOLD = 0.5
CHUNK_SECONDS = 8
SAMPLE_RATE = 16000


def _truncate_or_pad(audio: np.ndarray) -> np.ndarray:
    """Return *audio* as exactly ``CHUNK_SECONDS * SAMPLE_RATE`` samples.

    Longer input keeps only its final samples; shorter input is left-padded
    with zeros; input of the exact length is returned as the same object.
    """
    window = CHUNK_SECONDS * SAMPLE_RATE
    missing = window - len(audio)
    if missing < 0:
        # Too long: keep the tail of the buffer.
        return audio[-window:]
    if missing > 0:
        # Too short: zeros go in front so the audio stays at the end.
        return np.pad(audio, (missing, 0), mode="constant", constant_values=0)
    return audio
24
+
25
+
26
class SmartTurn:
    """End-of-utterance detector: an ONNX session fed with Whisper features.

    The model path is ``VOICE_MODEL_SMART_TURN`` when set and non-empty,
    otherwise ``smart-turn-v3.2-cpu.onnx`` inside ``VOICE_MODELS_DIR``
    (default ``~/.assist/voice/models``).
    """

    def __init__(self):
        model_path = os.environ.get("VOICE_MODEL_SMART_TURN") or os.path.join(
            os.environ.get(
                "VOICE_MODELS_DIR",
                os.path.expanduser("~/.assist/voice/models"),
            ),
            "smart-turn-v3.2-cpu.onnx",
        )
        log("smart_turn_init", f"model={model_path}")

        # Sequential execution, one inter-op thread, all graph optimisations.
        options = ort.SessionOptions()
        options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
        options.inter_op_num_threads = 1
        options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

        self._session = ort.InferenceSession(
            model_path,
            sess_options=options,
            providers=["CPUExecutionProvider"],
        )
        self._feature_extractor = WhisperFeatureExtractor(chunk_length=CHUNK_SECONDS)
        self.threshold = END_THRESHOLD

    def is_end_of_turn(self, audio: np.ndarray) -> bool:
        """Return True when the model output for *audio* exceeds ``self.threshold``."""
        window = _truncate_or_pad(audio)
        extracted = self._feature_extractor(
            window,
            sampling_rate=SAMPLE_RATE,
            return_tensors="np",
            padding="max_length",
            max_length=CHUNK_SECONDS * SAMPLE_RATE,
            truncation=True,
            do_normalize=True,
        )
        # Drop the extractor's leading axis, cast, then add a leading axis
        # back for the session input.
        mel = extracted.input_features.squeeze(0).astype(np.float32)
        batch = np.expand_dims(mel, axis=0)
        outputs = self._session.run(None, {"input_features": batch})
        probability = float(outputs[0][0].item())
        return probability > self.threshold
@@ -0,0 +1,51 @@
1
+ """Parakeet NeMo STT wrapper (GPU)."""
2
+
3
+ import os
4
+ import numpy as np
5
+
6
+ from logger import log
7
+
8
DEFAULT_MODEL = "nvidia/parakeet-ctc-1.1b"


class ParakeetSTT:
    """Speech-to-text wrapper around a NeMo ``EncDecCTCModelBPE`` model.

    The model name is read from ``VOICE_MODEL_STT`` and falls back to
    ``DEFAULT_MODEL``.
    """

    def __init__(self):
        model_name = os.environ.get("VOICE_MODEL_STT", DEFAULT_MODEL)
        log("stt_init", f"model={model_name}")

        # Imported here so the module loads without NeMo installed.
        import nemo.collections.asr as nemo_asr

        self._model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name)
        self._model.eval()
        log("stt_ready")

    def transcribe(self, audio: np.ndarray, sample_rate: int = 16000) -> str:
        """Transcribe an audio buffer to text via a direct forward pass.

        Args:
            audio: 1-D sample buffer; its first dimension is used as the
                signal length.
            sample_rate: accepted for interface compatibility but not used
                here. NOTE(review): the audio is presumably already at the
                rate the model expects — confirm against the caller.

        Returns:
            The decoded text, or ``""`` for an empty buffer (the model is not
            invoked in that case).
        """
        if audio.shape[0] == 0:
            # Nothing to transcribe; skip the forward pass.
            return ""

        import torch

        # Build the inputs on the device the model lives on. If the model was
        # loaded onto a GPU, CPU tensors would fail with a device mismatch.
        device = next(self._model.parameters()).device
        audio_tensor = torch.tensor(
            audio, dtype=torch.float32, device=device
        ).unsqueeze(0)
        audio_len = torch.tensor([audio.shape[0]], dtype=torch.long, device=device)

        with torch.no_grad():
            logits, logits_len, _ = self._model.forward(
                input_signal=audio_tensor, input_signal_length=audio_len
            )
            # Greedy CTC decode
            preds = torch.argmax(logits, dim=-1)
            text = self._model.decoding.ctc_decoder_predictions_tensor(
                preds, decoder_lengths=logits_len
            )

        # Result may be nested: tuple of lists of Hypothesis objects
        if isinstance(text, tuple):
            text = text[0]
        if isinstance(text, list):
            text = text[0]
        # Hypothesis-style results carry the transcript in a .text field
        if hasattr(text, "text"):
            text = text.text
        if not isinstance(text, str):
            text = str(text)

        log("stt_result", text)
        return text