@staff0rd/assist 0.78.0 → 0.80.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -0
- package/claude/commands/comment.md +39 -0
- package/claude/commands/voice-logs.md +5 -0
- package/claude/commands/voice-setup.md +5 -0
- package/claude/commands/voice-start.md +5 -0
- package/claude/commands/voice-status.md +5 -0
- package/claude/commands/voice-stop.md +5 -0
- package/claude/settings.json +11 -0
- package/dist/commands/voice/python/audio_capture.py +49 -0
- package/dist/commands/voice/python/dispatch.py +14 -0
- package/dist/commands/voice/python/keyboard.py +73 -0
- package/dist/commands/voice/python/list_devices.py +20 -0
- package/dist/commands/voice/python/logger.py +38 -0
- package/dist/commands/voice/python/pyproject.toml +34 -0
- package/dist/commands/voice/python/setup_models.py +91 -0
- package/dist/commands/voice/python/smart_turn.py +63 -0
- package/dist/commands/voice/python/stt.py +51 -0
- package/dist/commands/voice/python/uv.lock +5947 -0
- package/dist/commands/voice/python/vad.py +50 -0
- package/dist/commands/voice/python/voice_daemon.py +362 -0
- package/dist/commands/voice/python/wake_word.py +26 -0
- package/dist/index.js +562 -179
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -31,6 +31,7 @@ After installation, the `assist` command will be available globally.
|
|
|
31
31
|
|
|
32
32
|
## Claude Commands
|
|
33
33
|
|
|
34
|
+
- `/comment` - Add pending review comments to the current PR
|
|
34
35
|
- `/commit` - Commit only relevant files from the session
|
|
35
36
|
- `/devlog` - Generate devlog entry for the next unversioned day
|
|
36
37
|
- `/next-backlog-item` - Pick and implement the next backlog item
|
|
@@ -42,6 +43,11 @@ After installation, the `assist` command will be available globally.
|
|
|
42
43
|
- `/verify` - Run all verification commands in parallel
|
|
43
44
|
- `/transcript-format` - Format meeting transcripts from VTT files
|
|
44
45
|
- `/transcript-summarise` - Summarise transcripts missing summaries
|
|
46
|
+
- `/voice-setup` - Download required voice models (VAD, STT)
|
|
47
|
+
- `/voice-start` - Start the voice interaction daemon
|
|
48
|
+
- `/voice-stop` - Stop the voice interaction daemon
|
|
49
|
+
- `/voice-status` - Check voice daemon status
|
|
50
|
+
- `/voice-logs` - Show recent voice daemon logs
|
|
45
51
|
|
|
46
52
|
## CLI Commands
|
|
47
53
|
|
|
@@ -54,6 +60,7 @@ After installation, the `assist` command will be available globally.
|
|
|
54
60
|
- `assist prs list-comments` - List all comments on the current branch's pull request
|
|
55
61
|
- `assist prs fixed <comment-id> <sha>` - Reply with commit link and resolve thread
|
|
56
62
|
- `assist prs wontfix <comment-id> <reason>` - Reply with reason and resolve thread
|
|
63
|
+
- `assist prs comment <path> <line> <body>` - Add a line comment to the pending review
|
|
57
64
|
- `assist backlog` - Start the backlog web UI (same as `backlog web`)
|
|
58
65
|
- `assist backlog init` - Create an empty assist.backlog.yml
|
|
59
66
|
- `assist backlog list [--status <type>] [-v]` - List all backlog items with status icons
|
|
@@ -94,4 +101,11 @@ After installation, the `assist` command will be available globally.
|
|
|
94
101
|
- `assist transcript configure` - Configure transcript directories
|
|
95
102
|
- `assist transcript format` - Convert VTT files to formatted markdown transcripts
|
|
96
103
|
- `assist transcript summarise` - List transcripts that do not have summaries
|
|
104
|
+
- `assist voice setup` - Download required voice models (VAD, STT)
|
|
105
|
+
- `assist voice start` - Start the voice daemon (always-on, listens for wake word)
|
|
106
|
+
- `assist voice start --foreground` - Start in foreground for debugging
|
|
107
|
+
- `assist voice stop` - Stop the voice daemon
|
|
108
|
+
- `assist voice status` - Check voice daemon status and recent events
|
|
109
|
+
- `assist voice devices` - List available audio input devices
|
|
110
|
+
- `assist voice logs [-n <count>]` - Show recent voice daemon log entries
|
|
97
111
|
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
---
|
|
2
|
+
description: Add pending review comments to the current PR
|
|
3
|
+
allowed_args: "<item numbers, e.g. 1,2,3>"
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
Add pending review comments to the current branch's pull request for the specified items.
|
|
7
|
+
|
|
8
|
+
## Parsing Arguments
|
|
9
|
+
|
|
10
|
+
Parse `$ARGUMENTS` as a comma-separated list of item numbers (e.g. `1,2` or `1,2,3`). These refer to items in a numbered list from earlier in the conversation.
|
|
11
|
+
|
|
12
|
+
## Finding the Referenced List
|
|
13
|
+
|
|
14
|
+
Look back through the conversation for the most recent numbered list of issues, suggestions, or comments. Each item should have enough context to determine:
|
|
15
|
+
- **path**: the file path
|
|
16
|
+
- **line**: the line number
|
|
17
|
+
- **body**: a concise comment describing the issue
|
|
18
|
+
|
|
19
|
+
If any referenced item number doesn't exist in the list, report the error and skip it.
|
|
20
|
+
|
|
21
|
+
## Posting Comments
|
|
22
|
+
|
|
23
|
+
For each referenced item, run:
|
|
24
|
+
|
|
25
|
+
```
|
|
26
|
+
assist prs comment <path> <line> '<body>' 2>&1
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
**Important:** Always use single quotes around `<body>`, never double quotes. Double quotes cause shell escaping issues with backticks and special characters.
|
|
30
|
+
|
|
31
|
+
The body must:
|
|
32
|
+
- Be a clear, concise description of the issue (1-2 sentences)
|
|
33
|
+
- Not contain "claude" or "opus" (case-insensitive) — the command will reject it
|
|
34
|
+
- Not contain single quotes (reword to avoid them)
|
|
35
|
+
- Use backticks to wrap inline code or keywords (e.g. `functionName`)
|
|
36
|
+
|
|
37
|
+
## Report
|
|
38
|
+
|
|
39
|
+
After posting, summarise which comments were added and any that failed.
|
package/claude/settings.json
CHANGED
|
@@ -27,6 +27,7 @@
|
|
|
27
27
|
"Bash(assist transcript summarise:*)",
|
|
28
28
|
"Bash(assist complexity:*)",
|
|
29
29
|
"Bash(assist transcript format:*)",
|
|
30
|
+
"Bash(assist voice:*)",
|
|
30
31
|
"Bash(date:*)",
|
|
31
32
|
"Bash(git add:*)",
|
|
32
33
|
"Bash(git status:*)",
|
|
@@ -48,6 +49,11 @@
|
|
|
48
49
|
"SlashCommand(/review-comments)",
|
|
49
50
|
"SlashCommand(/transcript-format)",
|
|
50
51
|
"SlashCommand(/transcript-summarise)",
|
|
52
|
+
"SlashCommand(/voice-setup)",
|
|
53
|
+
"SlashCommand(/voice-start)",
|
|
54
|
+
"SlashCommand(/voice-stop)",
|
|
55
|
+
"SlashCommand(/voice-status)",
|
|
56
|
+
"SlashCommand(/voice-logs)",
|
|
51
57
|
"SlashCommand(/journal)",
|
|
52
58
|
"SlashCommand(/standup)",
|
|
53
59
|
"Skill(next-backlog-item)",
|
|
@@ -60,6 +66,11 @@
|
|
|
60
66
|
"Skill(transcript-summarise)",
|
|
61
67
|
"Skill(journal)",
|
|
62
68
|
"Skill(standup)",
|
|
69
|
+
"Skill(voice-setup)",
|
|
70
|
+
"Skill(voice-start)",
|
|
71
|
+
"Skill(voice-stop)",
|
|
72
|
+
"Skill(voice-status)",
|
|
73
|
+
"Skill(voice-logs)",
|
|
63
74
|
"WebFetch(domain:staffordwilliams.com)"
|
|
64
75
|
],
|
|
65
76
|
"deny": ["Bash(git commit:*)", "Bash(npm run:*)", "Bash(npx assist:*)"]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Microphone capture via sounddevice (16kHz PCM)."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import queue
|
|
5
|
+
import numpy as np
|
|
6
|
+
import sounddevice as sd
|
|
7
|
+
|
|
8
|
+
from logger import log
|
|
9
|
+
|
|
10
|
+
SAMPLE_RATE = 16000
|
|
11
|
+
BLOCK_SIZE = 512 # Silero VAD requires exactly 512 samples at 16kHz
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class AudioCapture:
    """Pulls mono float32 blocks from the microphone into a thread-safe queue.

    sounddevice invokes the callback on its own audio thread; a Queue hands
    each block over to the consuming thread safely.
    """

    def __init__(self):
        self._queue: queue.Queue[np.ndarray] = queue.Queue()
        self._stream: sd.InputStream | None = None
        # Unset and empty VOICE_MIC both mean "use the default input device".
        self._device = os.environ.get("VOICE_MIC") or None

    def _callback(self, indata: np.ndarray, frames: int, time_info, status) -> None:
        # Surface PortAudio status flags (e.g. overruns) as warnings.
        if status:
            log("audio_status", str(status), level="warn")
        # Keep only channel 0; copy because sounddevice reuses the buffer.
        self._queue.put(indata[:, 0].copy())

    def start(self) -> None:
        """Open the input stream and begin capturing."""
        log("audio_start", f"device={self._device}, rate={SAMPLE_RATE}, block={BLOCK_SIZE}")
        self._stream = sd.InputStream(
            samplerate=SAMPLE_RATE,
            channels=1,
            dtype="float32",
            blocksize=BLOCK_SIZE,
            device=self._device,
            callback=self._callback,
        )
        self._stream.start()

    def read(self, timeout: float = 1.0) -> np.ndarray | None:
        """Return the next captured block, or None if none arrives in time."""
        try:
            return self._queue.get(timeout=timeout)
        except queue.Empty:
            return None

    def stop(self) -> None:
        """Stop and close the stream if one is running, then log."""
        if self._stream:
            self._stream.stop()
            self._stream.close()
            self._stream = None
        log("audio_stop")
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Dispatch — keyboard-based input into the active terminal."""
|
|
2
|
+
|
|
3
|
+
from logger import log
|
|
4
|
+
|
|
5
|
+
import keyboard
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def dispatch(command: str) -> str:
    """Inject *command* into the focused window as keystrokes and submit it.

    The command is typed character by character and followed by Enter.
    Returns the command unchanged so callers can log or chain it.
    """
    log("dispatch_start", command)
    keyboard.type_text(command)
    keyboard.press_enter()
    log("dispatch_done", command)
    return command
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Simulate keyboard input on Windows via SendInput."""
|
|
2
|
+
|
|
3
|
+
import ctypes
|
|
4
|
+
import ctypes.wintypes as w
|
|
5
|
+
|
|
6
|
+
user32 = ctypes.windll.user32
|
|
7
|
+
|
|
8
|
+
INPUT_KEYBOARD = 1
|
|
9
|
+
KEYEVENTF_UNICODE = 0x0004
|
|
10
|
+
KEYEVENTF_KEYUP = 0x0002
|
|
11
|
+
KEYEVENTF_SCANCODE = 0x0008
|
|
12
|
+
VK_RETURN = 0x0D
|
|
13
|
+
VK_BACK = 0x08
|
|
14
|
+
SCAN_RETURN = 0x1C
|
|
15
|
+
SCAN_BACK = 0x0E
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class KEYBDINPUT(ctypes.Structure):
    # Mirrors the Win32 KEYBDINPUT struct; field order and ctypes types must
    # match the Windows ABI exactly, so the layout is not to be reordered.
    _fields_ = [
        ("wVk", w.WORD),
        ("wScan", w.WORD),
        ("dwFlags", w.DWORD),
        ("time", w.DWORD),
        ("dwExtraInfo", ctypes.POINTER(ctypes.c_ulong)),
    ]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class MOUSEINPUT(ctypes.Structure):
    # Mirrors the Win32 MOUSEINPUT struct. Declared only so the INPUT union
    # below has the correct overall size; no mouse events are sent here.
    _fields_ = [
        ("dx", ctypes.c_long),
        ("dy", ctypes.c_long),
        ("mouseData", w.DWORD),
        ("dwFlags", w.DWORD),
        ("time", w.DWORD),
        ("dwExtraInfo", ctypes.POINTER(ctypes.c_ulong)),
    ]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class INPUT(ctypes.Structure):
    # Win32 INPUT struct: a type tag plus a union of the event payloads.
    # NOTE(review): the union omits HARDWAREINPUT (hi); size still matches
    # because MOUSEINPUT is the largest member — confirm if hi is ever needed.
    class _INPUT(ctypes.Union):
        _fields_ = [("mi", MOUSEINPUT), ("ki", KEYBDINPUT)]

    # _anonymous_ lets callers write inp.ki directly instead of inp._input.ki.
    _anonymous_ = ("_input",)
    _fields_ = [("type", w.DWORD), ("_input", _INPUT)]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _send_key(vk: int = 0, scan: int = 0, flags: int = 0) -> None:
    """Emit one keyboard event through SendInput.

    vk is a virtual-key code, scan a scan code (or a Unicode code point when
    KEYEVENTF_UNICODE is set), flags the KEYEVENTF_* bitmask.
    """
    event = INPUT(type=INPUT_KEYBOARD)
    event.ki.wVk = vk
    event.ki.wScan = scan
    event.ki.dwFlags = flags
    user32.SendInput(1, ctypes.byref(event), ctypes.sizeof(event))
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def type_text(text: str) -> None:
    """Type a string by sending Unicode keystrokes."""
    # KEYEVENTF_UNICODE makes wScan carry the character's code point.
    down = KEYEVENTF_UNICODE
    up = KEYEVENTF_UNICODE | KEYEVENTF_KEYUP
    for char in text:
        point = ord(char)
        _send_key(scan=point, flags=down)
        _send_key(scan=point, flags=up)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def backspace(n: int = 1) -> None:
    """Press backspace n times (key-down then key-up per press)."""
    for _ in range(n):
        for flags in (0, KEYEVENTF_KEYUP):
            _send_key(vk=VK_BACK, scan=SCAN_BACK, flags=flags)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def press_enter() -> None:
    """Tap the Enter key: key-down followed by key-up."""
    _send_key(vk=VK_RETURN, scan=SCAN_RETURN, flags=0)
    _send_key(vk=VK_RETURN, scan=SCAN_RETURN, flags=KEYEVENTF_KEYUP)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""List available audio input devices."""
|
|
2
|
+
|
|
3
|
+
import sounddevice as sd
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def main() -> None:
    """Print every input-capable audio device with its index and capabilities."""
    # Hoist the default-input index; it does not change while we iterate.
    default_input = sd.default.device[0]
    print("Audio input devices:\n")
    for index, info in enumerate(sd.query_devices()):
        if info["max_input_channels"] <= 0:
            continue  # output-only device
        default = " (default)" if index == default_input else ""
        print(f" [{index}] {info['name']}{default}")
        ch = info["max_input_channels"]
        rate = info["default_samplerate"]
        print(f" channels={ch}, rate={rate}")
        print()
|
17
|
+
|
|
18
|
+
|
|
19
|
+
if __name__ == "__main__":
|
|
20
|
+
main()
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""JSON Lines structured logging to voice.log."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
LOG_FILE = os.environ.get(
|
|
10
|
+
"VOICE_LOG_FILE", os.path.expanduser("~/.assist/voice/voice.log")
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
DEBUG = os.environ.get("VOICE_DEBUG", "") == "1"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _write(entry: dict) -> None:
    """Stamp *entry* with a UTC timestamp and append it as one JSON line.

    Logging must never take down the daemon, so file errors are swallowed.
    With VOICE_DEBUG=1 a human-readable copy is echoed to stderr.
    """
    entry["timestamp"] = datetime.now(timezone.utc).isoformat()
    serialized = json.dumps(entry)
    try:
        with open(LOG_FILE, "a", encoding="utf-8") as fh:
            fh.write(serialized + "\n")
    except OSError:
        pass  # best-effort: a missing or locked log file is not fatal
    if not DEBUG:
        return
    # Slice HH:MM:SS out of the ISO timestamp for a compact stderr line.
    ts = entry["timestamp"][11:19]
    level = entry.get("level", "info").upper()
    event = entry.get("event", "")
    msg = entry.get("message", "")
    print(f"{ts} {level:5s} [{event}] {msg}", file=sys.stderr, flush=True)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def log(event: str, message: str = "", *, level: str = "info", **data) -> None:
    """Record a structured log event.

    Empty message/data are omitted from the entry so log lines stay compact.
    """
    entry: dict = {"event": event, "level": level}
    for key, value in (("message", message), ("data", data)):
        if value:
            entry[key] = value
    _write(entry)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "assist-voice"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
requires-python = ">=3.10"
|
|
5
|
+
dependencies = [
|
|
6
|
+
"onnxruntime>=1.17",
|
|
7
|
+
"sounddevice>=0.4",
|
|
8
|
+
"numpy>=1.24",
|
|
9
|
+
"nemo_toolkit[asr]>=1.22",
|
|
10
|
+
"silero-vad>=5.1",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
[project.optional-dependencies]
|
|
14
|
+
dev = [
|
|
15
|
+
"ruff>=0.8",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[tool.setuptools]
|
|
19
|
+
py-modules = [
|
|
20
|
+
"audio_capture",
|
|
21
|
+
"dispatch",
|
|
22
|
+
"logger",
|
|
23
|
+
"smart_turn",
|
|
24
|
+
"stt",
|
|
25
|
+
"vad",
|
|
26
|
+
"voice_daemon",
|
|
27
|
+
"wake_word",
|
|
28
|
+
"setup_models",
|
|
29
|
+
"list_devices",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[build-system]
|
|
33
|
+
requires = ["setuptools>=68"]
|
|
34
|
+
build-backend = "setuptools.build_meta"
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Download and prepare all required voice models."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
from logger import log
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_models_dir() -> str:
    """Resolve the models directory: VOICE_MODELS_DIR env override, else
    ~/.assist/voice/models expanded for the current user."""
    default = os.path.expanduser("~/.assist/voice/models")
    return os.environ.get("VOICE_MODELS_DIR", default)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def setup_silero_vad(models_dir: str) -> None:
    """Download the Silero VAD ONNX model into *models_dir*; skip if present."""
    target = os.path.join(models_dir, "silero_vad.onnx")
    if os.path.exists(target):
        print(" silero_vad.onnx already exists")
        return

    print(" Downloading Silero VAD ONNX model...")
    # Imported lazily: only needed when a download actually happens.
    import urllib.request

    url = (
        "https://github.com/snakers4/silero-vad/raw/master/"
        "src/silero_vad/data/silero_vad.onnx"
    )
    urllib.request.urlretrieve(url, target)
    log("setup_vad", f"Downloaded to {target}")
    print(" silero_vad.onnx downloaded")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def setup_smart_turn(models_dir: str) -> None:
    """Download the Smart Turn ONNX model from HuggingFace; skip if present."""
    target = os.path.join(models_dir, "smart-turn-v3.2-cpu.onnx")
    if os.path.exists(target):
        print(" smart-turn-v3.2-cpu.onnx already exists")
        return

    print(" Downloading Smart Turn ONNX model from HuggingFace...")
    # Imported lazily so the dependency is only touched on a real download.
    from huggingface_hub import hf_hub_download

    downloaded = hf_hub_download(
        repo_id="pipecat-ai/smart-turn-v3",
        filename="smart-turn-v3.2-cpu.onnx",
        local_dir=models_dir,
    )
    log("setup_smart_turn", f"Downloaded to {downloaded}")
    print(" smart-turn-v3.2-cpu.onnx downloaded")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def setup_stt(models_dir: str) -> None:
    """Warm the NeMo STT model cache (model name from VOICE_MODEL_STT).

    NOTE(review): models_dir is unused here — NeMo caches under its own
    directory; the parameter is kept for signature consistency with the
    other setup steps.
    """
    model_name = os.environ.get("VOICE_MODEL_STT", "nvidia/parakeet-ctc-1.1b")
    print(f" Downloading STT model: {model_name}...")
    print(" (this may take a while on first run)")

    import nemo.collections.asr as nemo_asr

    # Instantiating from_pretrained downloads and caches the checkpoint.
    nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name)
    log("setup_stt", f"Model ready: {model_name}")
    print(f" STT model ready: {model_name}")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def main() -> None:
    """Run all three model setup steps, reporting failures without aborting.

    Each step is independent: a failure is logged and printed, and the
    remaining steps still run.
    """
    models_dir = get_models_dir()
    os.makedirs(models_dir, exist_ok=True)
    print(f"Models directory: {models_dir}\n")

    steps = (
        ("[1/3] Silero VAD", setup_silero_vad, "setup_vad_error"),
        ("\n[2/3] Smart Turn (pipecat-ai)", setup_smart_turn, "setup_smart_turn_error"),
        ("\n[3/3] Parakeet STT (NeMo)", setup_stt, "setup_stt_error"),
    )
    for heading, step, error_event in steps:
        print(heading)
        try:
            step(models_dir)
        except Exception as e:
            log(error_event, str(e), level="error")
            print(f" ERROR: {e}", file=sys.stderr)

    print("\nSetup complete.")
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
if __name__ == "__main__":
|
|
91
|
+
main()
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Smart Turn end-of-utterance detection (ONNX) via pipecat-ai/smart-turn."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import onnxruntime as ort
|
|
7
|
+
from transformers import WhisperFeatureExtractor
|
|
8
|
+
|
|
9
|
+
from logger import log
|
|
10
|
+
|
|
11
|
+
END_THRESHOLD = 0.5
|
|
12
|
+
CHUNK_SECONDS = 8
|
|
13
|
+
SAMPLE_RATE = 16000
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _truncate_or_pad(audio: np.ndarray) -> np.ndarray:
|
|
17
|
+
max_samples = CHUNK_SECONDS * SAMPLE_RATE
|
|
18
|
+
if len(audio) > max_samples:
|
|
19
|
+
return audio[-max_samples:]
|
|
20
|
+
if len(audio) < max_samples:
|
|
21
|
+
padding = max_samples - len(audio)
|
|
22
|
+
return np.pad(audio, (padding, 0), mode="constant", constant_values=0)
|
|
23
|
+
return audio
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class SmartTurn:
    """End-of-utterance classifier backed by the Smart Turn ONNX model.

    Audio is converted to Whisper log-mel features, run through the ONNX
    session on CPU, and the resulting probability is compared against
    END_THRESHOLD.
    """

    def __init__(self):
        # Model path resolution: explicit VOICE_MODEL_SMART_TURN wins,
        # otherwise look inside the (possibly overridden) models directory.
        model_path = os.environ.get("VOICE_MODEL_SMART_TURN")
        if not model_path:
            models_dir = os.environ.get(
                "VOICE_MODELS_DIR",
                os.path.expanduser("~/.assist/voice/models"),
            )
            model_path = os.path.join(models_dir, "smart-turn-v3.2-cpu.onnx")

        log("smart_turn_init", f"model={model_path}")
        # Single-threaded sequential execution keeps CPU usage predictable
        # for an always-on daemon.
        so = ort.SessionOptions()
        so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
        so.inter_op_num_threads = 1
        so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        self._session = ort.InferenceSession(
            model_path, sess_options=so, providers=["CPUExecutionProvider"]
        )
        # chunk_length matches the model's fixed CHUNK_SECONDS input window.
        self._feature_extractor = WhisperFeatureExtractor(chunk_length=CHUNK_SECONDS)
        self.threshold = END_THRESHOLD

    def is_end_of_turn(self, audio: np.ndarray) -> bool:
        """Check if the accumulated audio indicates end of utterance.

        *audio* is assumed to be mono float32 at SAMPLE_RATE — TODO confirm
        against the capture pipeline. Returns True when the model's
        probability exceeds self.threshold.
        """
        audio = _truncate_or_pad(audio)
        inputs = self._feature_extractor(
            audio,
            sampling_rate=SAMPLE_RATE,
            return_tensors="np",
            padding="max_length",
            max_length=CHUNK_SECONDS * SAMPLE_RATE,
            truncation=True,
            do_normalize=True,
        )
        # Re-shape to the (1, mels, frames) batch layout the session expects.
        features = inputs.input_features.squeeze(0).astype(np.float32)
        features = np.expand_dims(features, axis=0)
        outputs = self._session.run(None, {"input_features": features})
        prob = float(outputs[0][0].item())
        return prob > self.threshold
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Parakeet NeMo STT wrapper (GPU)."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
from logger import log
|
|
7
|
+
|
|
8
|
+
DEFAULT_MODEL = "nvidia/parakeet-ctc-1.1b"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ParakeetSTT:
    """Speech-to-text wrapper around a NeMo EncDecCTCModelBPE checkpoint.

    The model name comes from VOICE_MODEL_STT (default DEFAULT_MODEL) and is
    loaded once at construction; transcription runs a direct forward pass
    with greedy CTC decoding.
    """

    def __init__(self):
        model_name = os.environ.get("VOICE_MODEL_STT", DEFAULT_MODEL)
        log("stt_init", f"model={model_name}")

        # Imported lazily: NeMo is heavy and only needed once STT is used.
        import nemo.collections.asr as nemo_asr

        self._model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name)
        # eval() disables dropout etc. for deterministic inference.
        self._model.eval()
        log("stt_ready")

    def transcribe(self, audio: np.ndarray, sample_rate: int = 16000) -> str:
        """Transcribe audio buffer to text via direct forward pass.

        *audio* is a 1-D float waveform; sample_rate is accepted but not
        forwarded to the model — presumably the model expects 16 kHz input,
        TODO confirm. Returns the decoded text (possibly empty).
        """
        import torch

        # Batch of one: (1, num_samples) plus its length tensor.
        audio_tensor = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)
        audio_len = torch.tensor([audio.shape[0]], dtype=torch.long)

        with torch.no_grad():
            logits, logits_len, _ = self._model.forward(
                input_signal=audio_tensor, input_signal_length=audio_len
            )
            # Greedy CTC decode
            preds = torch.argmax(logits, dim=-1)
            text = self._model.decoding.ctc_decoder_predictions_tensor(
                preds, decoder_lengths=logits_len
            )

        # Result may be nested: tuple of lists of Hypothesis objects.
        # Unwrap layer by layer until a plain string remains.
        if isinstance(text, tuple):
            text = text[0]
        if isinstance(text, list):
            text = text[0]
        # NeMo may return Hypothesis objects carrying the text in a .text
        # attribute.
        if hasattr(text, "text"):
            text = text.text
        if not isinstance(text, str):
            text = str(text)

        log("stt_result", text)
        return text
|