@staff0rd/assist 0.78.0 → 0.80.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -31,6 +31,7 @@ After installation, the `assist` command will be available globally.
31
31
 
32
32
  ## Claude Commands
33
33
 
34
+ - `/comment` - Add pending review comments to the current PR
34
35
  - `/commit` - Commit only relevant files from the session
35
36
  - `/devlog` - Generate devlog entry for the next unversioned day
36
37
  - `/next-backlog-item` - Pick and implement the next backlog item
@@ -42,6 +43,11 @@ After installation, the `assist` command will be available globally.
42
43
  - `/verify` - Run all verification commands in parallel
43
44
  - `/transcript-format` - Format meeting transcripts from VTT files
44
45
  - `/transcript-summarise` - Summarise transcripts missing summaries
46
+ - `/voice-setup` - Download required voice models (VAD, Smart Turn, STT)
47
+ - `/voice-start` - Start the voice interaction daemon
48
+ - `/voice-stop` - Stop the voice interaction daemon
49
+ - `/voice-status` - Check voice daemon status
50
+ - `/voice-logs` - Show recent voice daemon logs
45
51
 
46
52
  ## CLI Commands
47
53
 
@@ -54,6 +60,7 @@ After installation, the `assist` command will be available globally.
54
60
  - `assist prs list-comments` - List all comments on the current branch's pull request
55
61
  - `assist prs fixed <comment-id> <sha>` - Reply with commit link and resolve thread
56
62
  - `assist prs wontfix <comment-id> <reason>` - Reply with reason and resolve thread
63
+ - `assist prs comment <path> <line> <body>` - Add a line comment to the pending review
57
64
  - `assist backlog` - Start the backlog web UI (same as `backlog web`)
58
65
  - `assist backlog init` - Create an empty assist.backlog.yml
59
66
  - `assist backlog list [--status <type>] [-v]` - List all backlog items with status icons
@@ -94,4 +101,11 @@ After installation, the `assist` command will be available globally.
94
101
  - `assist transcript configure` - Configure transcript directories
95
102
  - `assist transcript format` - Convert VTT files to formatted markdown transcripts
96
103
  - `assist transcript summarise` - List transcripts that do not have summaries
104
+ - `assist voice setup` - Download required voice models (VAD, Smart Turn, STT)
105
+ - `assist voice start` - Start the voice daemon (always-on, listens for wake word)
106
+ - `assist voice start --foreground` - Start in foreground for debugging
107
+ - `assist voice stop` - Stop the voice daemon
108
+ - `assist voice status` - Check voice daemon status and recent events
109
+ - `assist voice devices` - List available audio input devices
110
+ - `assist voice logs [-n <count>]` - Show recent voice daemon log entries
97
111
 
@@ -0,0 +1,39 @@
1
+ ---
2
+ description: Add pending review comments to the current PR
3
+ allowed_args: "<item numbers, e.g. 1,2,3>"
4
+ ---
5
+
6
+ Add pending review comments to the current branch's pull request for the specified items.
7
+
8
+ ## Parsing Arguments
9
+
10
+ Parse `$ARGUMENTS` as a comma-separated list of item numbers (e.g. `1,2` or `1,2,3`). These refer to items in a numbered list from earlier in the conversation.
11
+
12
+ ## Finding the Referenced List
13
+
14
+ Look back through the conversation for the most recent numbered list of issues, suggestions, or comments. Each item should have enough context to determine:
15
+ - **path**: the file path
16
+ - **line**: the line number
17
+ - **body**: a concise comment describing the issue
18
+
19
+ If any referenced item number doesn't exist in the list, report the error and skip it.
20
+
21
+ ## Posting Comments
22
+
23
+ For each referenced item, run:
24
+
25
+ ```
26
+ assist prs comment <path> <line> '<body>' 2>&1
27
+ ```
28
+
29
+ **Important:** Always use single quotes around `<body>`, never double quotes. Double quotes cause shell escaping issues with backticks and special characters.
30
+
31
+ The body must:
32
+ - Be a clear, concise description of the issue (1-2 sentences)
33
+ - Not contain "claude" or "opus" (case-insensitive) — the command will reject it
34
+ - Not contain single quotes (reword to avoid them)
35
+ - Use backticks to wrap inline code or keywords (e.g. `functionName`)
36
+
37
+ ## Report
38
+
39
+ After posting, summarise which comments were added and any that failed.
@@ -0,0 +1,5 @@
1
+ ---
2
+ description: Show recent voice daemon logs
3
+ ---
4
+
5
+ Run `assist voice logs 2>&1` to show recent voice daemon logs. Show the result to the user.
@@ -0,0 +1,5 @@
1
+ ---
2
+ description: Download required voice models (VAD, Smart Turn, STT)
3
+ ---
4
+
5
+ Run `assist voice setup 2>&1` to download the required voice models. Show the result to the user.
@@ -0,0 +1,5 @@
1
+ ---
2
+ description: Start the voice interaction daemon
3
+ ---
4
+
5
+ Run `assist voice start 2>&1` to start the voice daemon. If it fails, show the error to the user.
@@ -0,0 +1,5 @@
1
+ ---
2
+ description: Check voice daemon status
3
+ ---
4
+
5
+ Run `assist voice status 2>&1` to check the voice daemon status. Show the result to the user.
@@ -0,0 +1,5 @@
1
+ ---
2
+ description: Stop the voice interaction daemon
3
+ ---
4
+
5
+ Run `assist voice stop 2>&1` to stop the voice daemon. Show the result to the user.
@@ -27,6 +27,7 @@
27
27
  "Bash(assist transcript summarise:*)",
28
28
  "Bash(assist complexity:*)",
29
29
  "Bash(assist transcript format:*)",
30
+ "Bash(assist voice:*)",
30
31
  "Bash(date:*)",
31
32
  "Bash(git add:*)",
32
33
  "Bash(git status:*)",
@@ -48,6 +49,11 @@
48
49
  "SlashCommand(/review-comments)",
49
50
  "SlashCommand(/transcript-format)",
50
51
  "SlashCommand(/transcript-summarise)",
52
+ "SlashCommand(/voice-setup)",
53
+ "SlashCommand(/voice-start)",
54
+ "SlashCommand(/voice-stop)",
55
+ "SlashCommand(/voice-status)",
56
+ "SlashCommand(/voice-logs)",
51
57
  "SlashCommand(/journal)",
52
58
  "SlashCommand(/standup)",
53
59
  "Skill(next-backlog-item)",
@@ -60,6 +66,11 @@
60
66
  "Skill(transcript-summarise)",
61
67
  "Skill(journal)",
62
68
  "Skill(standup)",
69
+ "Skill(voice-setup)",
70
+ "Skill(voice-start)",
71
+ "Skill(voice-stop)",
72
+ "Skill(voice-status)",
73
+ "Skill(voice-logs)",
63
74
  "WebFetch(domain:staffordwilliams.com)"
64
75
  ],
65
76
  "deny": ["Bash(git commit:*)", "Bash(npm run:*)", "Bash(npx assist:*)"]
@@ -0,0 +1,49 @@
1
+ """Microphone capture via sounddevice (16kHz PCM)."""
2
+
3
+ import os
4
+ import queue
5
+ import numpy as np
6
+ import sounddevice as sd
7
+
8
+ from logger import log
9
+
10
+ SAMPLE_RATE = 16000
11
+ BLOCK_SIZE = 512 # Silero VAD requires exactly 512 samples at 16kHz
12
+
13
+
14
+ class AudioCapture:
15
+ def __init__(self):
16
+ self._queue: queue.Queue[np.ndarray] = queue.Queue()
17
+ self._stream: sd.InputStream | None = None
18
+ device_name = os.environ.get("VOICE_MIC")
19
+ self._device = device_name if device_name else None
20
+
21
+ def _callback(self, indata: np.ndarray, frames: int, time_info, status) -> None:
22
+ if status:
23
+ log("audio_status", str(status), level="warn")
24
+ self._queue.put(indata[:, 0].copy())
25
+
26
+ def start(self) -> None:
27
+ log("audio_start", f"device={self._device}, rate={SAMPLE_RATE}, block={BLOCK_SIZE}")
28
+ self._stream = sd.InputStream(
29
+ samplerate=SAMPLE_RATE,
30
+ channels=1,
31
+ dtype="float32",
32
+ blocksize=BLOCK_SIZE,
33
+ device=self._device,
34
+ callback=self._callback,
35
+ )
36
+ self._stream.start()
37
+
38
+ def read(self, timeout: float = 1.0) -> np.ndarray | None:
39
+ try:
40
+ return self._queue.get(timeout=timeout)
41
+ except queue.Empty:
42
+ return None
43
+
44
+ def stop(self) -> None:
45
+ if self._stream:
46
+ self._stream.stop()
47
+ self._stream.close()
48
+ self._stream = None
49
+ log("audio_stop")
@@ -0,0 +1,14 @@
1
+ """Dispatch — keyboard-based input into the active terminal."""
2
+
3
+ from logger import log
4
+
5
+ import keyboard
6
+
7
+
8
def dispatch(command: str) -> str:
    """Type *command* into the active terminal and submit it with Enter.

    Returns the command unchanged so callers can log or chain it.
    """
    log("dispatch_start", command)
    keyboard.type_text(command)
    keyboard.press_enter()
    log("dispatch_done", command)
    return command
@@ -0,0 +1,73 @@
1
"""Simulate keyboard input on Windows via SendInput."""

import ctypes
import ctypes.wintypes as w

# Windows-only: ctypes.windll exists only on Windows, so importing this
# module on other platforms raises AttributeError.
user32 = ctypes.windll.user32

# SendInput constants (winuser.h).
INPUT_KEYBOARD = 1            # INPUT.type: keyboard event
KEYEVENTF_UNICODE = 0x0004    # wScan carries a UTF-16 code unit instead of a scan code
KEYEVENTF_KEYUP = 0x0002      # key release (press is the default)
KEYEVENTF_SCANCODE = 0x0008   # NOTE(review): defined but never used below
VK_RETURN = 0x0D              # virtual-key code: Enter
VK_BACK = 0x08                # virtual-key code: Backspace
SCAN_RETURN = 0x1C            # hardware scan code: Enter
SCAN_BACK = 0x0E              # hardware scan code: Backspace
16
+
17
+
18
class KEYBDINPUT(ctypes.Structure):
    # Mirrors the Win32 KEYBDINPUT struct; field order and types are ABI-critical.
    _fields_ = [
        ("wVk", w.WORD),        # virtual-key code (0 when sending Unicode)
        ("wScan", w.WORD),      # scan code, or UTF-16 unit with KEYEVENTF_UNICODE
        ("dwFlags", w.DWORD),   # combination of KEYEVENTF_* flags
        ("time", w.DWORD),      # event timestamp; 0 lets the system fill it in
        ("dwExtraInfo", ctypes.POINTER(ctypes.c_ulong)),
    ]
26
+
27
+
28
class MOUSEINPUT(ctypes.Structure):
    # Mirrors the Win32 MOUSEINPUT struct.  Never populated here — it exists
    # only so the INPUT union below has the correct (largest-member) size.
    _fields_ = [
        ("dx", ctypes.c_long),
        ("dy", ctypes.c_long),
        ("mouseData", w.DWORD),
        ("dwFlags", w.DWORD),
        ("time", w.DWORD),
        ("dwExtraInfo", ctypes.POINTER(ctypes.c_ulong)),
    ]
37
+
38
+
39
class INPUT(ctypes.Structure):
    # Win32 INPUT: a device-type tag plus a union of per-device event structs.
    class _INPUT(ctypes.Union):
        _fields_ = [("mi", MOUSEINPUT), ("ki", KEYBDINPUT)]

    # _anonymous_ lets callers write inp.ki directly instead of inp._input.ki.
    _anonymous_ = ("_input",)
    _fields_ = [("type", w.DWORD), ("_input", _INPUT)]
45
+
46
+
47
def _send_key(vk: int = 0, scan: int = 0, flags: int = 0) -> None:
    """Emit a single keyboard event through SendInput."""
    event = INPUT(type=INPUT_KEYBOARD)
    event.ki = KEYBDINPUT(wVk=vk, wScan=scan, dwFlags=flags, time=0, dwExtraInfo=None)
    user32.SendInput(1, ctypes.byref(event), ctypes.sizeof(event))
53
+
54
+
55
def type_text(text: str) -> None:
    """Type a string by sending Unicode keystrokes.

    Characters outside the Basic Multilingual Plane (e.g. emoji) do not fit
    in the 16-bit wScan field, so the text is encoded as UTF-16 and each
    16-bit code unit — including surrogate pairs — is sent as its own
    key-down/key-up, which is exactly what KEYEVENTF_UNICODE expects.
    """
    data = text.encode("utf-16-le")
    for i in range(0, len(data), 2):
        code = int.from_bytes(data[i : i + 2], "little")
        _send_key(scan=code, flags=KEYEVENTF_UNICODE)
        _send_key(scan=code, flags=KEYEVENTF_UNICODE | KEYEVENTF_KEYUP)
61
+
62
+
63
def backspace(n: int = 1) -> None:
    """Press backspace n times (each press is a down/up pair)."""
    for _ in range(n):
        for flag in (0, KEYEVENTF_KEYUP):
            _send_key(vk=VK_BACK, scan=SCAN_BACK, flags=flag)
68
+
69
+
70
def press_enter() -> None:
    """Press the Enter key (down then up)."""
    for flag in (0, KEYEVENTF_KEYUP):
        _send_key(vk=VK_RETURN, scan=SCAN_RETURN, flags=flag)
@@ -0,0 +1,20 @@
1
+ """List available audio input devices."""
2
+
3
+ import sounddevice as sd
4
+
5
+
6
def main() -> None:
    """Print every audio input device with its channel count and sample rate."""
    default_input = sd.default.device[0]
    print("Audio input devices:\n")
    for index, info in enumerate(sd.query_devices()):
        if info["max_input_channels"] <= 0:
            continue  # output-only device
        suffix = " (default)" if index == default_input else ""
        print(f" [{index}] {info['name']}{suffix}")
        print(f" channels={info['max_input_channels']}, rate={info['default_samplerate']}")
        print()
17
+
18
+
19
+ if __name__ == "__main__":
20
+ main()
@@ -0,0 +1,38 @@
1
"""JSON Lines structured logging to voice.log."""

import json
import os
import sys
from datetime import datetime, timezone


# Log destination; overridable for tests or alternate install locations.
LOG_FILE = os.environ.get(
    "VOICE_LOG_FILE", os.path.expanduser("~/.assist/voice/voice.log")
)

# When VOICE_DEBUG=1, mirror entries to stderr in a human-readable form.
DEBUG = os.environ.get("VOICE_DEBUG", "") == "1"


def _write(entry: dict) -> None:
    """Append *entry* as one JSON line.  Best-effort: never raises on I/O."""
    entry["timestamp"] = datetime.now(timezone.utc).isoformat()
    line = json.dumps(entry)
    try:
        # Create the log directory on first use — otherwise every write fails
        # (and is silently dropped) until `assist voice setup` has created
        # ~/.assist/voice.
        directory = os.path.dirname(LOG_FILE)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except OSError:
        # Logging must never crash the daemon.
        pass
    if DEBUG:
        ts = entry["timestamp"][11:19]  # HH:MM:SS slice of the ISO timestamp
        level = entry.get("level", "info").upper()
        event = entry.get("event", "")
        msg = entry.get("message", "")
        print(f"{ts} {level:5s} [{event}] {msg}", file=sys.stderr, flush=True)


def log(event: str, message: str = "", *, level: str = "info", **data) -> None:
    """Record a structured event with optional message and extra data fields."""
    entry: dict = {"event": event, "level": level}
    if message:
        entry["message"] = message
    if data:
        entry["data"] = data
    _write(entry)
@@ -0,0 +1,34 @@
1
+ [project]
2
+ name = "assist-voice"
3
+ version = "0.1.0"
4
+ requires-python = ">=3.10"
5
+ dependencies = [
+ "onnxruntime>=1.17",
+ "sounddevice>=0.4",
+ "numpy>=1.24",
+ "nemo_toolkit[asr]>=1.22",
+ "silero-vad>=5.1",
+ "transformers>=4.40",
+ "huggingface_hub>=0.20",
+ ]
12
+
13
+ [project.optional-dependencies]
14
+ dev = [
15
+ "ruff>=0.8",
16
+ ]
17
+
18
+ [tool.setuptools]
19
+ py-modules = [
20
+ "audio_capture",
21
+ "dispatch",
22
+ "logger",
23
+ "smart_turn",
24
+ "stt",
25
+ "vad",
26
+ "voice_daemon",
27
+ "wake_word",
28
+ "setup_models",
29
+ "list_devices",
30
+ ]
31
+
32
+ [build-system]
33
+ requires = ["setuptools>=68"]
34
+ build-backend = "setuptools.build_meta"
@@ -0,0 +1,91 @@
1
+ """Download and prepare all required voice models."""
2
+
3
+ import os
4
+ import sys
5
+
6
+ from logger import log
7
+
8
+
9
def get_models_dir() -> str:
    """Resolve the voice models directory (VOICE_MODELS_DIR overrides the default)."""
    default = os.path.expanduser("~/.assist/voice/models")
    return os.environ.get("VOICE_MODELS_DIR", default)
14
+
15
+
16
def setup_silero_vad(models_dir: str) -> None:
    """Download the Silero VAD ONNX model into *models_dir* (idempotent)."""
    target = os.path.join(models_dir, "silero_vad.onnx")
    if os.path.exists(target):
        print(" silero_vad.onnx already exists")
        return

    print(" Downloading Silero VAD ONNX model...")
    import urllib.request

    url = "https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx"
    # Download to a temp name first: an interrupted transfer must not leave a
    # partial file that the exists-check above would treat as a cached model.
    tmp = target + ".part"
    try:
        urllib.request.urlretrieve(url, tmp)
    except BaseException:
        if os.path.exists(tmp):
            os.remove(tmp)
        raise
    os.replace(tmp, target)  # atomic publish
    log("setup_vad", f"Downloaded to {target}")
    print(" silero_vad.onnx downloaded")
29
+
30
+
31
def setup_smart_turn(models_dir: str) -> None:
    """Fetch the Smart Turn ONNX model from HuggingFace (idempotent)."""
    target = os.path.join(models_dir, "smart-turn-v3.2-cpu.onnx")
    if os.path.exists(target):
        print(" smart-turn-v3.2-cpu.onnx already exists")
        return

    print(" Downloading Smart Turn ONNX model from HuggingFace...")
    # Deferred import keeps module import cheap when the model is cached.
    from huggingface_hub import hf_hub_download

    downloaded = hf_hub_download(
        repo_id="pipecat-ai/smart-turn-v3",
        filename="smart-turn-v3.2-cpu.onnx",
        local_dir=models_dir,
    )
    log("setup_smart_turn", f"Downloaded to {downloaded}")
    print(" smart-turn-v3.2-cpu.onnx downloaded")
47
+
48
+
49
def setup_stt(models_dir: str) -> None:
    """Pre-fetch the NeMo Parakeet STT model so first daemon start is fast.

    NOTE(review): *models_dir* is unused — NeMo's from_pretrained caches into
    its own cache directory, not into our models dir; kept for signature
    symmetry with the other setup steps.
    """
    model_name = os.environ.get("VOICE_MODEL_STT", "nvidia/parakeet-ctc-1.1b")
    print(f" Downloading STT model: {model_name}...")
    print(" (this may take a while on first run)")

    # Deferred import: NeMo is heavy and only needed for this step.
    import nemo.collections.asr as nemo_asr

    # Instantiating the model triggers the download/cache; the instance is
    # discarded — only the cached weights matter here.
    nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name)
    log("setup_stt", f"Model ready: {model_name}")
    print(f" STT model ready: {model_name}")
59
+
60
+
61
def main() -> None:
    """Run all three model-setup steps, continuing past individual failures."""
    models_dir = get_models_dir()
    os.makedirs(models_dir, exist_ok=True)
    print(f"Models directory: {models_dir}\n")

    # (section title, setup function, log event on failure)
    steps = [
        ("Silero VAD", setup_silero_vad, "setup_vad_error"),
        ("Smart Turn (pipecat-ai)", setup_smart_turn, "setup_smart_turn_error"),
        ("Parakeet STT (NeMo)", setup_stt, "setup_stt_error"),
    ]
    for index, (title, step, error_event) in enumerate(steps, start=1):
        newline = "" if index == 1 else "\n"
        print(f"{newline}[{index}/3] {title}")
        try:
            step(models_dir)
        except Exception as e:
            log(error_event, str(e), level="error")
            print(f" ERROR: {e}", file=sys.stderr)

    print("\nSetup complete.")
88
+
89
+
90
+ if __name__ == "__main__":
91
+ main()
@@ -0,0 +1,63 @@
1
+ """Smart Turn end-of-utterance detection (ONNX) via pipecat-ai/smart-turn."""
2
+
3
+ import os
4
+
5
+ import numpy as np
6
+ import onnxruntime as ort
7
+ from transformers import WhisperFeatureExtractor
8
+
9
+ from logger import log
10
+
11
+ END_THRESHOLD = 0.5
12
+ CHUNK_SECONDS = 8
13
+ SAMPLE_RATE = 16000
14
+
15
+
16
+ def _truncate_or_pad(audio: np.ndarray) -> np.ndarray:
17
+ max_samples = CHUNK_SECONDS * SAMPLE_RATE
18
+ if len(audio) > max_samples:
19
+ return audio[-max_samples:]
20
+ if len(audio) < max_samples:
21
+ padding = max_samples - len(audio)
22
+ return np.pad(audio, (padding, 0), mode="constant", constant_values=0)
23
+ return audio
24
+
25
+
26
class SmartTurn:
    """End-of-utterance detector backed by the pipecat-ai Smart Turn ONNX model.

    Audio is converted to Whisper log-mel features, and the model outputs a
    probability that the speaker has finished their turn.
    """

    def __init__(self):
        # Explicit model path via VOICE_MODEL_SMART_TURN wins; otherwise fall
        # back to the shared models directory used by `assist voice setup`.
        model_path = os.environ.get("VOICE_MODEL_SMART_TURN")
        if not model_path:
            models_dir = os.environ.get(
                "VOICE_MODELS_DIR",
                os.path.expanduser("~/.assist/voice/models"),
            )
            model_path = os.path.join(models_dir, "smart-turn-v3.2-cpu.onnx")

        log("smart_turn_init", f"model={model_path}")
        # Conservative single-threaded, sequential CPU session options.
        so = ort.SessionOptions()
        so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
        so.inter_op_num_threads = 1
        so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        self._session = ort.InferenceSession(
            model_path, sess_options=so, providers=["CPUExecutionProvider"]
        )
        # chunk_length is in seconds — matches the CHUNK_SECONDS audio window.
        self._feature_extractor = WhisperFeatureExtractor(chunk_length=CHUNK_SECONDS)
        # Probability above this threshold is declared end-of-turn.
        self.threshold = END_THRESHOLD

    def is_end_of_turn(self, audio: np.ndarray) -> bool:
        """Check if the accumulated audio indicates end of utterance.

        Assumes *audio* is 1-D float PCM at 16 kHz (matching AudioCapture
        output) — TODO confirm at the call site.
        """
        # Keep only the most recent CHUNK_SECONDS; left-pad shorter input.
        audio = _truncate_or_pad(audio)
        inputs = self._feature_extractor(
            audio,
            sampling_rate=SAMPLE_RATE,
            return_tensors="np",
            padding="max_length",
            max_length=CHUNK_SECONDS * SAMPLE_RATE,
            truncation=True,
            do_normalize=True,
        )
        # Drop the extractor's batch axis, then re-add a leading batch axis
        # of 1 in float32 for the ONNX graph's "input_features" input.
        features = inputs.input_features.squeeze(0).astype(np.float32)
        features = np.expand_dims(features, axis=0)
        outputs = self._session.run(None, {"input_features": features})
        # First output, first (only) batch element: end-of-turn probability.
        prob = float(outputs[0][0].item())
        return prob > self.threshold
@@ -0,0 +1,51 @@
1
+ """Parakeet NeMo STT wrapper (GPU)."""
2
+
3
+ import os
4
+ import numpy as np
5
+
6
+ from logger import log
7
+
8
+ DEFAULT_MODEL = "nvidia/parakeet-ctc-1.1b"
9
+
10
+
11
class ParakeetSTT:
    """Speech-to-text wrapper around NVIDIA NeMo Parakeet CTC models."""

    def __init__(self):
        # Model is selectable via VOICE_MODEL_STT; defaults to parakeet-ctc-1.1b.
        model_name = os.environ.get("VOICE_MODEL_STT", DEFAULT_MODEL)
        log("stt_init", f"model={model_name}")

        # Deferred import: NeMo is heavy, keep module import cheap.
        import nemo.collections.asr as nemo_asr

        self._model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name)
        self._model.eval()  # inference mode
        log("stt_ready")

    def transcribe(self, audio: np.ndarray, sample_rate: int = 16000) -> str:
        """Transcribe audio buffer to text via direct forward pass.

        NOTE(review): *sample_rate* is accepted but unused — presumably the
        model expects 16 kHz input; verify against callers.
        """
        import torch

        # Shape to (batch=1, samples) with an explicit length tensor, as
        # NeMo's forward() expects.
        audio_tensor = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)
        audio_len = torch.tensor([audio.shape[0]], dtype=torch.long)

        with torch.no_grad():
            logits, logits_len, _ = self._model.forward(
                input_signal=audio_tensor, input_signal_length=audio_len
            )
            # Greedy CTC decode
            preds = torch.argmax(logits, dim=-1)
            text = self._model.decoding.ctc_decoder_predictions_tensor(
                preds, decoder_lengths=logits_len
            )

        # Result may be nested: tuple of lists of Hypothesis objects
        if isinstance(text, tuple):
            text = text[0]
        if isinstance(text, list):
            text = text[0]
        # NeMo returns Hypothesis namedtuples with a .text field
        if hasattr(text, "text"):
            text = text.text
        if not isinstance(text, str):
            # Last-resort stringification so callers always get a str.
            text = str(text)

        log("stt_result", text)
        return text