@nicfox77/parakeet-stt 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,101 @@
1
+ # Parakeet STT for OpenClaw
2
+
3
+ Fast CPU-based speech-to-text using NVIDIA's Parakeet TDT INT8 models.
4
+
5
+ ## Features
6
+
7
+ - **4x faster than real-time** (0.25x RTF)
8
+ - **CPU-only** - no GPU required
9
+ - **Two model versions:**
10
+ - **V2** - English optimized (higher accuracy for English)
11
+ - **V3** - Multilingual (25 European languages, auto-detect)
12
+ - **Lazy loading** - model loads on first transcription, unloads after inactivity
13
+
14
+ ## Installation
15
+
16
+ ### 1. Install the plugin
17
+
18
+ ```bash
19
+ openclaw plugins install @nicfox77/parakeet-stt
20
+ ```
21
+
22
+ ### 2. Install a model
23
+
24
+ ```bash
25
+ # English optimized (default)
26
+ ~/.openclaw/extensions/parakeet-stt/scripts/install.sh v2
27
+
28
+ # Or multilingual
29
+ ~/.openclaw/extensions/parakeet-stt/scripts/install.sh v3
30
+ ```
31
+
32
+ This downloads the pre-quantized INT8 model (~475MB) from the [Handy project](https://github.com/cjpais/Handy).
33
+
34
+ ### 3. Configure OpenClaw
35
+
36
+ Add to your `openclaw.json`:
37
+
38
+ ```json
39
+ {
40
+ "tools": {
41
+ "media": {
42
+ "audio": {
43
+ "enabled": true,
44
+ "models": [
45
+ {
46
+ "type": "cli",
47
+ "command": "/home/YOUR_USER/.openclaw/tools/parakeet/parakeet-audio-client.py",
48
+ "args": ["{{MediaPath}}", "{{OutputDir}}"]
49
+ }
50
+ ]
51
+ }
52
+ }
53
+ },
54
+ "plugins": {
55
+ "entries": {
56
+ "parakeet-stt": {
57
+ "enabled": true,
58
+ "modelVersion": "v2"
59
+ }
60
+ }
61
+ }
62
+ }
63
+ ```
64
+
65
+ ## Switching Models
66
+
67
+ ```bash
68
+ # Switch to V2 (English)
69
+ ~/.openclaw/extensions/parakeet-stt/scripts/install.sh v2
70
+
71
+ # Switch to V3 (Multilingual)
72
+ ~/.openclaw/extensions/parakeet-stt/scripts/install.sh v3
73
+ ```
74
+
75
+ The install script updates a symlink, so the daemon automatically uses the new model on next load.
76
+
77
+ ## CLI Commands
78
+
79
+ ```bash
80
+ # Check status
81
+ openclaw parakeet:status
82
+
83
+ # Show the install command for a model (prints the command; it does not download anything itself)
84
+ openclaw parakeet:install v2
85
+ ```
86
+
87
+ ## Requirements
88
+
89
+ - Python 3.8+
90
+ - ~500MB disk space for model
91
+ - ~500MB RAM when model loaded
92
+
93
+ ## Credits
94
+
95
+ - Models: [NVIDIA Parakeet TDT](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2)
96
+ - INT8 Quantization: [Handy](https://github.com/cjpais/Handy) by cjpais
97
+ - ONNX Runtime for inference
98
+
99
+ ## License
100
+
101
+ MIT
package/index.ts ADDED
@@ -0,0 +1,83 @@
1
+ /**
2
+ * Parakeet STT Plugin for OpenClaw
3
+ *
4
+ * Provides fast CPU-based speech-to-text using Parakeet TDT INT8 models.
5
+ * Supports V2 (English optimized) and V3 (Multilingual) model selection.
6
+ *
7
+ * The actual transcription is configured via tools.media.audio.models in openclaw.json.
8
+ */
9
+
10
+ import { Type } from "@sinclair/typebox";
11
+
12
/**
 * Plugin entry point: registers the Parakeet CLI commands and agent tool.
 *
 * Registers two CLI commands (`parakeet:status`, `parakeet:install`) and one
 * optional agent tool (`parakeet_status`). None of them starts the daemon or
 * runs the install script; they only report the effective settings and the
 * shell command the user should run.
 *
 * @param api - Plugin API object; this function uses its `logger`, `config`,
 *   `registerCommand` and `registerTool` members.
 */
export default function (api: any) {
  const pluginId = "parakeet-stt";

  // HOME may be unset (e.g. under some service managers); fall back so the
  // reported paths never start with the literal string "undefined/".
  const homeDir = process.env.HOME ?? process.env.USERPROFILE ?? "~";
  const toolsDir = `${homeDir}/.openclaw/tools/parakeet`;

  // Reads this plugin's config entry on every call (so edits made after load
  // are reflected) and applies the defaults shared by all handlers.
  const readSettings = () => {
    const cfg = api.config.plugins?.entries?.[pluginId] || {};
    return {
      cfg,
      modelVersion: cfg.modelVersion || "v2",
      modelPath: cfg.modelPath || `${toolsDir}/model`,
    };
  };

  api.logger.info?.("parakeet-stt: plugin loaded");

  // Register a CLI command for checking Parakeet status
  api.registerCommand({
    name: "parakeet:status",
    description: "Check Parakeet STT daemon status",
    async handler() {
      const { cfg, modelVersion, modelPath } = readSettings();

      return {
        modelVersion,
        modelPath,
        // Honor the configurable daemonPath declared in the plugin's config schema.
        daemonPath: cfg.daemonPath || `${toolsDir}/parakeet-lazy-daemon.py`,
        enabled: cfg.enabled !== false,
        timeout: cfg.timeoutMs || 30000,
        inactivityTimeout: (cfg.inactivityTimeoutMin || 20) + " minutes",
        installCommand: `bash ~/.openclaw/extensions/parakeet-stt/scripts/install.sh ${modelVersion}`,
      };
    },
  });

  // Register a CLI command that tells the user how to install a model.
  // It returns the command to run; it does not execute the script itself.
  api.registerCommand({
    name: "parakeet:install",
    description: "Download and install a Parakeet TDT model",
    async handler(args: { version?: string }) {
      const { modelVersion } = readSettings();
      const version = args?.version || modelVersion;
      const installScript = `${homeDir}/.openclaw/extensions/parakeet-stt/scripts/install.sh`;
      api.logger.info?.(`parakeet-stt: install command called for ${version}`);

      return {
        message: `Run the install script for ${version}:`,
        command: `bash ${installScript} ${version}`,
        hint: "v2 = English optimized, v3 = Multilingual (25 languages)",
      };
    },
  });

  // Register an agent tool for checking transcription status
  api.registerTool(
    {
      name: "parakeet_status",
      description: "Check the status of the Parakeet speech-to-text system",
      parameters: Type.Object({}),
      async execute() {
        const { cfg, modelVersion, modelPath } = readSettings();

        // `enabled` treats a missing flag as on; `configured` is true only
        // when the flag is explicitly set to a truthy value.
        const status = {
          enabled: cfg.enabled !== false,
          modelVersion,
          modelPath,
          configured: !!cfg.enabled,
        };

        return {
          content: [
            {
              type: "text",
              text: JSON.stringify(status, null, 2),
            },
          ],
        };
      },
    },
    { optional: true }
  );
}
@@ -0,0 +1,71 @@
1
+ {
2
+ "id": "parakeet-stt",
3
+ "name": "Parakeet STT",
4
+ "description": "Fast CPU-based speech-to-text using Parakeet TDT INT8 models. 4x faster than real-time. Supports V2 (English optimized) and V3 (multilingual).",
5
+ "version": "0.2.0",
6
+ "skills": ["skills/parakeet"],
7
+ "configSchema": {
8
+ "type": "object",
9
+ "additionalProperties": false,
10
+ "properties": {
11
+ "enabled": {
12
+ "type": "boolean"
13
+ },
14
+ "modelVersion": {
15
+ "type": "string",
16
+ "enum": ["v2", "v3"],
17
+ "default": "v2",
18
+ "description": "Model version: v2 for English (higher accuracy), v3 for multilingual (25 languages, auto-detect)"
19
+ },
20
+ "modelPath": {
21
+ "type": "string",
22
+ "description": "Path to the Parakeet ONNX model directory (defaults to the ~/.openclaw/tools/parakeet/model symlink maintained by install.sh)"
23
+ },
24
+ "daemonPath": {
25
+ "type": "string",
26
+ "description": "Path to the parakeet daemon script"
27
+ },
28
+ "timeoutMs": {
29
+ "type": "integer",
30
+ "minimum": 1000,
31
+ "default": 30000,
32
+ "description": "Timeout for transcription requests"
33
+ },
34
+ "inactivityTimeoutMin": {
35
+ "type": "integer",
36
+ "minimum": 1,
37
+ "default": 20,
38
+ "description": "Minutes of inactivity before unloading the model"
39
+ }
40
+ }
41
+ },
42
+ "uiHints": {
43
+ "enabled": {
44
+ "label": "Enable Parakeet STT"
45
+ },
46
+ "modelVersion": {
47
+ "label": "Model Version",
48
+ "help": "v2 = English optimized (higher accuracy), v3 = Multilingual (25 European languages with auto-detect)"
49
+ },
50
+ "modelPath": {
51
+ "label": "Model Path",
52
+ "placeholder": "~/.openclaw/tools/parakeet/model",
53
+ "help": "Directory containing the Parakeet ONNX model files",
54
+ "advanced": true
55
+ },
56
+ "daemonPath": {
57
+ "label": "Daemon Script Path",
58
+ "placeholder": "~/.openclaw/tools/parakeet/parakeet-lazy-daemon.py",
59
+ "advanced": true
60
+ },
61
+ "timeoutMs": {
62
+ "label": "Request Timeout (ms)",
63
+ "advanced": true
64
+ },
65
+ "inactivityTimeoutMin": {
66
+ "label": "Inactivity Timeout (minutes)",
67
+ "help": "How long before the model unloads to save memory",
68
+ "advanced": true
69
+ }
70
+ }
71
+ }
package/package.json ADDED
@@ -0,0 +1,27 @@
1
+ {
2
+ "name": "@nicfox77/parakeet-stt",
3
+ "version": "0.2.0",
4
+ "description": "Parakeet TDT INT8 speech-to-text plugin for OpenClaw. Supports V2 (English) and V3 (Multilingual) models.",
5
+ "type": "module",
6
+ "repository": {
7
+ "type": "git",
8
+ "url": "git+https://github.com/Nicfox77/openclaw-parakeet-stt.git"
9
+ },
10
+ "author": "Nicfox77",
11
+ "license": "MIT",
12
+ "keywords": [
13
+ "openclaw",
14
+ "parakeet",
15
+ "stt",
16
+ "speech-to-text",
17
+ "transcription",
18
+ "asr",
19
+ "nvidia"
20
+ ],
21
+ "dependencies": {
22
+ "@sinclair/typebox": "0.34.48"
23
+ },
24
+ "openclaw": {
25
+ "extensions": ["./index.ts"]
26
+ }
27
+ }
@@ -0,0 +1,137 @@
1
#!/bin/bash
# Parakeet TDT INT8 Model Installer
# Downloads pre-quantized INT8 models from the Handy project
# https://github.com/cjpais/Handy
#
# Usage: install.sh [v2|v3]
#   v2 - English optimized (default)
#   v3 - Multilingual
#
# Environment:
#   PARAKEET_DIR  Install root (default: $HOME/.openclaw/tools/parakeet)

set -e

# Configuration
PARAKEET_DIR="${PARAKEET_DIR:-$HOME/.openclaw/tools/parakeet}"
VENV_DIR="$PARAKEET_DIR/.venv"

# Model URLs (from Handy project)
MODEL_URLS_V2="https://blob.handy.computer/parakeet-v2-int8.tar.gz"
MODEL_URLS_V3="https://blob.handy.computer/parakeet-v3-int8.tar.gz"

# File that parakeet_transcribe.py loads from the model directory. Its presence
# marks a completed install, so switching between installed models does not
# trigger a fresh download.
MODEL_MARKER="encoder-model.int8.onnx"

# Default to V2 (English optimized)
VERSION="${1:-v2}"

# Validate version
if [[ "$VERSION" != "v2" && "$VERSION" != "v3" ]]; then
    echo "Usage: $0 [v2|v3]"
    echo "  v2 - English optimized (higher accuracy for English)"
    echo "  v3 - Multilingual (25 European languages, auto-detect)"
    exit 1
fi

# Select URL based on version
if [[ "$VERSION" == "v2" ]]; then
    MODEL_URL="$MODEL_URLS_V2"
    MODEL_DIR="$PARAKEET_DIR/model-v2"
    MODEL_SIZE="473MB"
    MODEL_DESC="English optimized"
else
    MODEL_URL="$MODEL_URLS_V3"
    MODEL_DIR="$PARAKEET_DIR/model-v3"
    MODEL_SIZE="478MB"
    MODEL_DESC="Multilingual (25 languages)"
fi

# Symlink that always points at the active model
ACTIVE_MODEL_LINK="$PARAKEET_DIR/model"

# Temporary artifacts are removed on every exit path, including failures.
TMP_TAR=""
STAGING_DIR=""
cleanup() {
    [ -n "$TMP_TAR" ] && rm -f "$TMP_TAR"
    [ -n "$STAGING_DIR" ] && rm -rf "$STAGING_DIR"
    return 0
}
trap cleanup EXIT

echo "=== Parakeet TDT $VERSION INT8 Installer ==="
echo "Model: $MODEL_DESC"
echo ""

# Check Python
if ! command -v python3 &> /dev/null; then
    echo "Error: Python 3 is required"
    exit 1
fi

echo "Python version: $(python3 --version)"

# Create directories
mkdir -p "$PARAKEET_DIR"

# Create virtual environment (reuse existing if present)
if [ ! -d "$VENV_DIR" ]; then
    echo "Creating Python virtual environment..."
    python3 -m venv "$VENV_DIR"
fi

# Activate venv
source "$VENV_DIR/bin/activate"

# Install minimal dependencies (Handy models just need onnxruntime + librosa)
echo "Installing Python dependencies..."
python -m pip install --upgrade pip
python -m pip install onnxruntime librosa soundfile

# Download and extract model if not present
if [ ! -f "$MODEL_DIR/$MODEL_MARKER" ]; then
    echo ""
    echo "Downloading Parakeet TDT $VERSION model (~$MODEL_SIZE)..."
    echo "URL: $MODEL_URL"

    # Unpredictable temp name instead of a fixed, world-guessable /tmp path.
    TMP_TAR="$(mktemp "${TMPDIR:-/tmp}/parakeet-$VERSION-int8.XXXXXX")"

    if command -v wget &> /dev/null; then
        wget -O "$TMP_TAR" "$MODEL_URL"
    elif command -v curl &> /dev/null; then
        # -f: fail on HTTP errors instead of saving the error page as the tarball
        curl -fL -o "$TMP_TAR" "$MODEL_URL"
    else
        echo "Error: wget or curl required for download"
        exit 1
    fi

    echo "Extracting model..."
    # Extract into a staging directory and locate the model files there, so the
    # result is the same whether the archive is flat or wraps the files in a
    # top-level directory.
    STAGING_DIR="$(mktemp -d "$PARAKEET_DIR/.extract.XXXXXX")"
    tar -xzf "$TMP_TAR" -C "$STAGING_DIR"

    MARKER_PATH="$(find "$STAGING_DIR" -type f -name "$MODEL_MARKER" | head -n 1)"
    if [ -z "$MARKER_PATH" ]; then
        echo "Error: $MODEL_MARKER not found in downloaded archive"
        exit 1
    fi

    mkdir -p "$MODEL_DIR"
    cp -R "$(dirname "$MARKER_PATH")"/. "$MODEL_DIR"/

    echo "Model downloaded and extracted successfully"
else
    echo "Model already exists at $MODEL_DIR"
fi

# Update symlink to active model
if [ -d "$ACTIVE_MODEL_LINK" ] && [ ! -L "$ACTIVE_MODEL_LINK" ]; then
    echo "Error: $ACTIVE_MODEL_LINK is a real directory, not a symlink; move it aside and re-run"
    exit 1
fi
# -n: replace the link itself instead of creating a new link inside its target
ln -sfn "$MODEL_DIR" "$ACTIVE_MODEL_LINK"
echo "Active model symlink: $ACTIVE_MODEL_LINK -> $MODEL_DIR"

# Copy scripts from extension to tools directory
SCRIPTS_SRC="$HOME/.openclaw/extensions/parakeet-stt/scripts"
if [ ! -d "$SCRIPTS_SRC" ]; then
    # Plugin installed somewhere else: use the directory this script lives in.
    SCRIPTS_SRC="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
fi
for script in parakeet-lazy-daemon.py parakeet-audio-client.py parakeet_transcribe.py; do
    if [ -f "$SCRIPTS_SRC/$script" ]; then
        cp "$SCRIPTS_SRC/$script" "$PARAKEET_DIR/"
        chmod +x "$PARAKEET_DIR/$script"
        echo "Copied $script"
    else
        echo "Warning: $script not found in $SCRIPTS_SRC (not copied)" >&2
    fi
done

echo ""
echo "=== Installation Complete ==="
echo ""
echo "Version: $VERSION ($MODEL_DESC)"
echo "Model directory: $MODEL_DIR"
echo "Active model: $ACTIVE_MODEL_LINK"
echo "Virtual environment: $VENV_DIR"
echo ""
echo "To switch models, run:"
echo "  $0 v2  # English optimized"
echo "  $0 v3  # Multilingual"
echo ""
echo "OpenClaw config (add to openclaw.json if not already present):"
echo '  tools.media.audio.models → parakeet-audio-client.py'
@@ -0,0 +1,73 @@
1
+ #!/usr/bin/env python3
2
+ import json
3
+ import os
4
+ import socket
5
+ import subprocess
6
+ import sys
7
+ import time
8
+
9
+ SOCKET_PATH = "/tmp/parakeet-lazy.sock"  # Unix-domain socket used to reach the lazy daemon
10
+ DAEMON_PATH = os.path.expanduser("~/.openclaw/tools/parakeet/parakeet-lazy-daemon.py")  # daemon script launched by ensure_daemon()
11
+
12
+ def ensure_daemon():
13
+ # Check if daemon socket exists and responsive
14
+ try:
15
+ with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
16
+ s.settimeout(0.5)
17
+ s.connect(SOCKET_PATH)
18
+ return # daemon already running
19
+ except Exception:
20
+ pass
21
+ # Start daemon in background
22
+ try:
23
+ subprocess.Popen(
24
+ [sys.executable, DAEMON_PATH],
25
+ stdout=subprocess.DEVNULL,
26
+ stderr=subprocess.DEVNULL,
27
+ start_new_session=True
28
+ )
29
+ time.sleep(1) # give it a moment to start
30
+ except Exception as e:
31
+ print(f"Failed to start daemon: {e}", file=sys.stderr)
32
+ sys.exit(1)
33
+
34
+ def query_daemon(audio_path):
35
+ for attempt in range(3):
36
+ try:
37
+ with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
38
+ s.connect(SOCKET_PATH)
39
+ request = {"action": "transcribe", "audio_path": audio_path}
40
+ s.sendall(json.dumps(request).encode() + b"\n")
41
+ response_data = b""
42
+ while True:
43
+ chunk = s.recv(4096)
44
+ if not chunk:
45
+ break
46
+ response_data += chunk
47
+ if b"\n" in chunk:
48
+ break
49
+ if response_data:
50
+ response = json.loads(response_data.strip())
51
+ if "text" in response:
52
+ print(response["text"])
53
+ return 0
54
+ else:
55
+ print(response.get("error", "Unknown error"), file=sys.stderr)
56
+ return 1
57
+ else:
58
+ time.sleep(0.5)
59
+ except Exception as e:
60
+ if attempt == 2:
61
+ print(f"Daemon communication failed: {e}", file=sys.stderr)
62
+ return 1
63
+ time.sleep(0.5)
64
+ return 1
65
+
66
+ if __name__ == "__main__":
67
+ if len(sys.argv) < 2:
68
+ print("Usage: parakeet-audio-client.py <audio_path> [output_dir]", file=sys.stderr)
69
+ sys.exit(1)
70
+ audio_path = sys.argv[1]
71
+ # output_dir = sys.argv[2] if len(sys.argv) > 2 else None
72
+ ensure_daemon()
73
+ sys.exit(query_daemon(audio_path))
@@ -0,0 +1,156 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Parakeet Lazy Daemon - Loads model on demand, unloads after inactivity.
4
+ Supports V2 (English) and V3 (Multilingual) model selection.
5
+ """
6
+ import json
7
+ import os
8
+ import signal
9
+ import socket
10
+ import sys
11
+ import time
12
+ from pathlib import Path
13
+
14
# Make the directory holding this script importable, so the sibling
# parakeet_transcribe module resolves regardless of the caller's working
# directory.
SCRIPT_DIR = Path(__file__).parent
sys.path.insert(0, str(SCRIPT_DIR))

# The import fails when parakeet_transcribe.py has not been copied next to this
# script yet, and also when one of that module's own imports is unavailable in
# the running interpreter. The underlying error is printed so the two cases can
# be told apart.
try:
    from parakeet_transcribe import ParakeetTDT
except ImportError as import_error:
    print("Error: parakeet_transcribe not found. Run install.sh first.", file=sys.stderr)
    print(f"Import error detail: {import_error}", file=sys.stderr)
    sys.exit(1)

# Unix-domain socket the daemon listens on; the client script connects to the same path.
SOCKET_PATH = "/tmp/parakeet-lazy.sock"
# Seconds without a transcription after which the loaded model is dropped.
IDLE_TIMEOUT = 20 * 60  # 20 minutes
27
+
28
def get_model_path():
    """Determine which model directory to use.

    Resolution order:
      1. ``~/.openclaw/tools/parakeet/model`` when it is a directory, or a
         symlink that resolves to one (install.sh maintains this link).
      2. The ``PARAKEET_MODEL_VERSION`` environment variable (``v2``/``2`` or
         ``v3``/``3``).
      3. The first installed directory among ``model-v2`` and ``model-v3``.
      4. The ``model`` path itself, which the caller is expected to reject
         when it does not exist.

    Returns:
        pathlib.Path of the chosen model directory (may not exist in case 4,
        or when the environment variable names a version that is not
        installed).
    """
    tools_dir = Path.home() / ".openclaw" / "tools" / "parakeet"

    # 1. Active-model link. is_dir() follows symlinks, so a dangling link is
    #    skipped here and the fallbacks below still get a chance.
    symlink = tools_dir / "model"
    if symlink.is_dir():
        return symlink.resolve()

    # 2. Check environment variable
    model_version = os.environ.get("PARAKEET_MODEL_VERSION", "").lower()
    if model_version in ("v2", "2"):
        return tools_dir / "model-v2"
    if model_version in ("v3", "3"):
        return tools_dir / "model-v3"

    # 3. Check for installed models, prefer v2 (English)
    for version in ("v2", "v3"):
        model_dir = tools_dir / f"model-{version}"
        if model_dir.is_dir():
            return model_dir

    # 4. Fallback to symlink path (will error if not installed)
    return symlink
52
+
53
+
54
class ParakeetLazyDaemon:
    """Unix-socket server that loads the model on demand and drops it when idle.

    Protocol: one newline-terminated JSON request per connection,
    ``{"action": "transcribe", "audio_path": ...}``. The reply is one
    newline-terminated JSON object holding either ``text``/``tokens``/
    ``timestamps`` or ``error``. Connections are served one at a time.
    """

    # Seconds accept() blocks before the loop re-checks the shutdown flag and
    # the idle timer. Without a timeout neither check could run while no
    # client connects.
    POLL_INTERVAL = 5.0
    # Seconds a connected client gets to deliver its request (and to read the
    # reply), so one stalled client cannot block the daemon forever.
    REQUEST_TIMEOUT = 10.0

    def __init__(self):
        """Validate the model directory, bind the socket, install signal handlers.

        Exits with status 1 when no model directory exists, and with status 0
        when another daemon already answers on SOCKET_PATH.
        """
        self.model_dir = get_model_path()
        self.transcriber = None
        self.last_used = None
        self.running = True

        # Validate model exists
        if not self.model_dir.is_dir():
            print(f"Error: Model not found at {self.model_dir}", file=sys.stderr)
            print("Run: ~/.openclaw/extensions/parakeet-stt/scripts/install.sh [v2|v3]", file=sys.stderr)
            sys.exit(1)

        print(f"Using model: {self.model_dir}", file=sys.stderr)

        # Do not steal the socket from a daemon that is still serving.
        if self._socket_in_use():
            print(f"Another daemon is already listening on {SOCKET_PATH}; exiting.", file=sys.stderr)
            sys.exit(0)

        # Clean up a stale socket file left by a previous run
        try:
            os.unlink(SOCKET_PATH)
        except OSError:
            if os.path.exists(SOCKET_PATH):
                raise

        self.server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        self.server.bind(SOCKET_PATH)
        # NOTE(review): world-writable socket lets any local user request
        # transcription of any file this process can read; consider 0o600 if
        # client and daemon always run as the same user.
        os.chmod(SOCKET_PATH, 0o666)  # world readable/writable
        self.server.listen(1)
        self.server.settimeout(self.POLL_INTERVAL)
        signal.signal(signal.SIGTERM, self._handle_signal)
        signal.signal(signal.SIGINT, self._handle_signal)

    @staticmethod
    def _socket_in_use():
        """Return True when something accepts connections on SOCKET_PATH."""
        try:
            with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as probe:
                probe.settimeout(0.5)
                probe.connect(SOCKET_PATH)
            return True
        except OSError:
            return False

    def _handle_signal(self, signum, frame):
        """Request shutdown; run() notices within POLL_INTERVAL seconds."""
        self.running = False

    def ensure_loaded(self):
        """Load the model if it is not in memory and refresh the idle timer."""
        if self.transcriber is None:
            # Re-resolve the model path on every (re)load, so a model switch
            # made by install.sh takes effect the next time the model loads.
            candidate = get_model_path()
            if candidate.is_dir():
                self.model_dir = candidate
            print("Loading Parakeet model...", file=sys.stderr)
            self.transcriber = ParakeetTDT(str(self.model_dir))
            print("Model loaded.", file=sys.stderr)
        self.last_used = time.time()

    def unload_if_idle(self):
        """Drop the model when it has been unused for more than IDLE_TIMEOUT seconds."""
        if self.transcriber is None or self.last_used is None:
            return
        idle = time.time() - self.last_used
        if idle > IDLE_TIMEOUT:
            print(f"Unloading model (idle {idle:.1f}s)", file=sys.stderr)
            self.transcriber = None
            import gc
            gc.collect()

    def handle_connection(self, conn):
        """Serve one request on ``conn`` and always close it.

        Any failure while reading, parsing or transcribing is reported to the
        client as ``{"error": ...}``. A failure while sending the reply is
        logged and otherwise ignored.
        """
        response = None
        try:
            conn.settimeout(self.REQUEST_TIMEOUT)
            data = b""
            while b"\n" not in data:
                chunk = conn.recv(4096)
                if not chunk:
                    break
                data += chunk
            if not data:
                response = {"error": "Empty request"}
            else:
                line = data.split(b"\n", 1)[0].strip()
                request = json.loads(line.decode())
                action = request.get("action")
                audio_path = request.get("audio_path")
                if action != "transcribe":
                    response = {"error": f"Unsupported action: {action}"}
                elif not audio_path:
                    response = {"error": "Missing audio_path"}
                else:
                    self.ensure_loaded()
                    audio = self.transcriber.load_audio(audio_path)
                    text, tokens, timestamps = self.transcriber.transcribe(audio)
                    self.last_used = time.time()
                    response = {"text": text, "tokens": tokens, "timestamps": timestamps}
        except Exception as e:
            response = {"error": str(e)}
        finally:
            try:
                if response is not None:
                    conn.sendall(json.dumps(response).encode() + b"\n")
            except OSError as send_error:
                # The client may already be gone (e.g. a liveness probe).
                print("Failed to send response:", send_error, file=sys.stderr)
            finally:
                conn.close()

    def run(self):
        """Accept and serve connections until a shutdown signal arrives."""
        print("ParakeetLazyDaemon listening on", SOCKET_PATH, file=sys.stderr)
        while self.running:
            try:
                conn, _addr = self.server.accept()
            except socket.timeout:
                # Quiet period: the only point where real idleness is observable.
                self.unload_if_idle()
                continue
            except Exception as e:
                print("Daemon error:", e, file=sys.stderr)
                time.sleep(0.1)  # avoid a busy loop on a persistent accept failure
                continue
            try:
                self.handle_connection(conn)
            except Exception as e:
                print("Daemon error:", e, file=sys.stderr)  # keep serving
            self.unload_if_idle()
        self.server.close()
        try:
            os.unlink(SOCKET_PATH)
        except OSError:
            pass
153
+
154
if __name__ == "__main__":
    # Construct the daemon, then serve until run() returns.
    ParakeetLazyDaemon().run()
@@ -0,0 +1,270 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Parakeet TDT V2 INT8 Transcription Script
4
+ Based on transcribe-rs implementation: https://github.com/cjpais/transcribe-rs
5
+ """
6
+
7
+ import argparse
8
+ import json
9
+ import time
10
+ import re
11
+ import numpy as np
12
+ from pathlib import Path
13
+
14
+ import onnxruntime as ort
15
+ import librosa
16
+
17
+
18
class ParakeetTDT:
    """Parakeet TDT INT8 transcriber using ONNX Runtime.

    Loads three ONNX graphs from a model directory: ``nemo128.onnx``
    (preprocessor), ``encoder-model.int8.onnx`` and
    ``decoder_joint-model.int8.onnx``. It also reads ``config.json`` and
    ``vocab.txt`` from there, and decodes greedily.
    """

    # Constants from transcribe-rs
    SUBSAMPLING_FACTOR = 8
    WINDOW_SIZE = 0.01
    MAX_TOKENS_PER_STEP = 10

    def __init__(self, model_dir: str):
        """Load config, vocabulary and the three ONNX sessions from ``model_dir``.

        Raises whatever ``open`` or ``onnxruntime.InferenceSession`` raise when
        a required file is missing or invalid.
        """
        model_dir = Path(model_dir)

        # Load config
        config_path = model_dir / "config.json"
        with open(config_path, encoding="utf-8") as f:
            self.config = json.load(f)

        # Load vocabulary
        self.vocab, self.blank_idx = self._load_vocab(model_dir / "vocab.txt")
        self.vocab_size = len(self.vocab)
        print(f"Loaded vocabulary: {self.vocab_size} tokens, blank_idx={self.blank_idx}")

        self.sample_rate = 16000

        # Create ONNX sessions
        sess_options = ort.SessionOptions()
        sess_options.intra_op_num_threads = 4
        sess_options.inter_op_num_threads = 4
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

        print("Loading preprocessor...")
        start = time.time()
        self.preprocessor = ort.InferenceSession(
            str(model_dir / "nemo128.onnx"), sess_options)
        print(f"Preprocessor loaded in {time.time() - start:.2f}s")

        print("Loading encoder...")
        start = time.time()
        self.encoder = ort.InferenceSession(
            str(model_dir / "encoder-model.int8.onnx"), sess_options)
        print(f"Encoder loaded in {time.time() - start:.2f}s")

        print("Loading decoder...")
        start = time.time()
        self.decoder = ort.InferenceSession(
            str(model_dir / "decoder_joint-model.int8.onnx"), sess_options)
        print(f"Decoder loaded in {time.time() - start:.2f}s")

    def _load_vocab(self, vocab_path: Path) -> tuple:
        """Load the vocabulary from a ``vocab.txt`` file.

        Each useful line is ``<token> <index>``; lines with fewer than two
        space-separated fields are ignored.

        Returns:
            ``(vocab, blank_idx)`` where ``vocab`` maps index to token text
            (with the SentencePiece marker U+2581 replaced by a space) and
            ``blank_idx`` is the index of ``<blk>`` (0 when absent).
        """
        vocab = {}
        blank_idx = 0

        # The file contains non-ASCII tokens (U+2581), so do not depend on the
        # locale's default encoding.
        with open(vocab_path, encoding="utf-8") as f:
            for line in f:
                parts = line.rstrip().split(' ')
                if len(parts) >= 2:
                    # Replace SentencePiece space marker with actual space
                    token = parts[0].replace('\u2581', ' ')
                    idx = int(parts[1])
                    vocab[idx] = token
                    if token.strip() == '<blk>':
                        blank_idx = idx

        return vocab, blank_idx

    def load_audio(self, audio_path: str) -> np.ndarray:
        """Load audio file and convert to 16kHz mono float32."""
        audio, sr = librosa.load(audio_path, sr=self.sample_rate, mono=True)
        return audio.astype(np.float32)

    def transcribe(self, audio: np.ndarray) -> tuple:
        """Transcribe audio using the Parakeet TDT model.

        Args:
            audio: 1-D float32 waveform, as returned by ``load_audio``.

        Returns:
            ``(text, tokens, timestamps)``: the decoded text, the emitted
            token ids, and for each token the encoder frame index at which it
            was emitted. Empty audio yields ``("", [], [])``.
        """
        # Nothing to transcribe; skip the ONNX graphs for a zero-length signal.
        if audio.shape[0] == 0:
            return "", [], []

        # Prepare audio
        waveforms = audio.reshape(1, -1)
        waveforms_lens = np.array([audio.shape[0]], dtype=np.int64)

        # Preprocess (mel spectrogram)
        start = time.time()
        prep_out = self.preprocessor.run(
            None,
            {'waveforms': waveforms, 'waveforms_lens': waveforms_lens}
        )
        features, features_lens = prep_out[0], prep_out[1]
        print(f"Preprocessor: {time.time() - start:.3f}s, features shape: {features.shape}")

        # Encode
        start = time.time()
        enc_out = self.encoder.run(
            None,
            {'audio_signal': features, 'length': features_lens}
        )
        encoder_out = enc_out[0]  # [1, 1024, time]
        encoder_out_lens = enc_out[1]
        # Transpose to [1, time, 1024] like transcribe-rs
        encoder_out = encoder_out.transpose(0, 2, 1)
        print(f"Encoder: {time.time() - start:.3f}s, encoded shape: {encoder_out.shape}")

        # Decode
        start = time.time()
        tokens, timestamps = self._decode_sequence(
            encoder_out[0], int(encoder_out_lens[0]))
        decode_time = time.time() - start
        print(f"Decode: {decode_time:.3f}s")

        # Convert to text
        text = self._decode_tokens(tokens)

        return text, tokens, timestamps

    def _create_decoder_state(self) -> tuple:
        """Create initial decoder state (LSTM hidden states)."""
        # Shape: [2, 1, 640] for batch_size=1
        state1 = np.zeros((2, 1, 640), dtype=np.float32)
        state2 = np.zeros((2, 1, 640), dtype=np.float32)
        return state1, state2

    def _decode_step(self, prev_tokens: list, prev_state: tuple,
                     encoder_step: np.ndarray) -> tuple:
        """Run one decoder step.

        Args:
            prev_tokens: Previously emitted tokens
            prev_state: Previous decoder state (state1, state2)
            encoder_step: Encoder output for current frame [1024]

        Returns:
            (logits, new_state)
        """
        # Get last token or blank if empty
        target_token = prev_tokens[-1] if prev_tokens else self.blank_idx

        # Prepare inputs
        # encoder_outputs: [1, 1024, 1] (batch, dim, time) - matches ONNX input shape
        encoder_outputs = encoder_step.reshape(1, -1, 1).astype(np.float32)
        targets = np.array([[target_token]], dtype=np.int32)
        target_length = np.array([1], dtype=np.int32)  # Must be int32 for this model
        state1, state2 = prev_state

        outputs = self.decoder.run(
            None,
            {
                'encoder_outputs': encoder_outputs,
                'targets': targets,
                'target_length': target_length,
                'input_states_1': state1,
                'input_states_2': state2,
            }
        )

        logits = outputs[0]  # [1, 1, vocab_size + duration]
        new_state1 = outputs[2]
        new_state2 = outputs[3]

        return logits[0, 0], (new_state1, new_state2)

    def _decode_sequence(self, encodings: np.ndarray,
                         encodings_len: int) -> tuple:
        """Decode encoded sequence using greedy algorithm.

        Implements TDT decoding with MAX_TOKENS_PER_STEP limit. The decoder
        state only advances when a non-blank token is emitted; the frame
        index advances on a blank or once MAX_TOKENS_PER_STEP tokens have
        been emitted for the current frame.
        """
        prev_state = self._create_decoder_state()
        tokens = []
        timestamps = []

        t = 0
        emitted_tokens = 0

        while t < encodings_len:
            encoder_step = encodings[t]  # [1024]
            logits, new_state = self._decode_step(tokens, prev_state, encoder_step)

            # For TDT: split into vocab logits and duration logits
            if len(logits) > self.vocab_size:
                vocab_logits = logits[:self.vocab_size]
                # Duration logits not used in basic greedy decoding
            else:
                vocab_logits = logits

            # Get argmax token
            token = int(np.argmax(vocab_logits))

            # Process non-blank token
            if token != self.blank_idx:
                prev_state = new_state
                tokens.append(token)
                timestamps.append(t)
                emitted_tokens += 1

            # Advance frame on blank OR after max tokens per step
            if token == self.blank_idx or emitted_tokens == self.MAX_TOKENS_PER_STEP:
                t += 1
                emitted_tokens = 0

        return tokens, timestamps

    def _decode_tokens(self, ids: list) -> str:
        """Convert token IDs to text.

        Ids that are not in the vocabulary are skipped. Word boundaries are
        already embedded in the token text as leading spaces (see
        ``_load_vocab``), so tokens are concatenated, runs of spaces are
        collapsed, and the result is stripped.
        """
        # The vocabulary is a dict keyed by index, so test membership rather
        # than comparing against its size (indices need not be contiguous).
        pieces = [self.vocab[token_id] for token_id in ids if token_id in self.vocab]

        # Join all tokens - spaces are already embedded
        text = ''.join(pieces)

        # Clean up multiple spaces
        text = re.sub(r' +', ' ', text)

        return text.strip()
235
+
236
+
237
def main():
    """Command-line entry point: transcribe one audio file and print timing.

    When ``--model`` is not given, the active-model link maintained by
    install.sh (``~/.openclaw/tools/parakeet/model``) is used. The previous
    default location is still honored when it exists and the link does not.
    """
    parser = argparse.ArgumentParser(description='Parakeet TDT V2 INT8 Transcription')
    parser.add_argument('audio', help='Path to audio file')
    parser.add_argument('--model', default=None,
                        help='Path to model directory '
                             '(default: ~/.openclaw/tools/parakeet/model)')
    args = parser.parse_args()

    if args.model is not None:
        model_path = Path(args.model).expanduser()
    else:
        installed = Path('~/.openclaw/tools/parakeet/model').expanduser()
        legacy = Path('~/.openclaw/models/parakeet-tdt-0.6b-v2-int8').expanduser()
        use_legacy = legacy.is_dir() and not installed.is_dir()
        model_path = legacy if use_legacy else installed
    audio_path = args.audio

    print(f"Loading model from {model_path}...")
    transcriber = ParakeetTDT(model_path)

    print(f"\nLoading audio from {audio_path}...")
    audio = transcriber.load_audio(audio_path)
    duration = len(audio) / transcriber.sample_rate
    print(f"Audio duration: {duration:.2f}s")

    print("\nTranscribing...")
    start = time.time()
    text, tokens, timestamps = transcriber.transcribe(audio)
    total_time = time.time() - start

    print(f"\n{'='*60}")
    print(f"TRANSCRIPTION:")
    print(f"{text}")
    print(f"{'='*60}")
    print(f"\nPerformance:")
    print(f"  Total time: {total_time:.3f}s")
    if duration > 0:
        print(f"  Real-time factor: {total_time / duration:.2f}x")
    else:
        # Zero-length audio: the ratio is undefined, so avoid dividing by zero.
        print("  Real-time factor: n/a (empty audio)")


if __name__ == '__main__':
    main()
@@ -0,0 +1,89 @@
1
+ ---
2
+ name: parakeet
3
+ description: Parakeet speech-to-text system. Provides fast CPU-based transcription using Parakeet TDT INT8 models. Use when checking transcription status or troubleshooting audio issues.
4
+ ---
5
+
6
+ # Parakeet STT
7
+
8
+ Fast CPU-based speech-to-text using NVIDIA's Parakeet TDT INT8 models.
9
+
10
+ ## Model Versions
11
+
12
+ | Version | Description | Languages |
13
+ |---------|-------------|-----------|
14
+ | **V2** | English optimized | English (higher accuracy) |
15
+ | **V3** | Multilingual | 25 European languages + auto-detect |
16
+
17
+ ## Install / Switch Model
18
+
19
+ ```bash
20
+ # Install V2 (English optimized - default)
21
+ ~/.openclaw/extensions/parakeet-stt/scripts/install.sh v2
22
+
23
+ # Install/switch to V3 (Multilingual)
24
+ ~/.openclaw/extensions/parakeet-stt/scripts/install.sh v3
25
+ ```
26
+
27
+ The install script:
28
+ - Downloads the pre-quantized INT8 model (~475MB)
29
+ - Sets up the Python virtual environment
30
+ - Creates a symlink at `~/.openclaw/tools/parakeet/model` pointing to the active model
31
+
32
+ ## Status Check
33
+
34
+ ```bash
35
+ openclaw parakeet:status
36
+ ```
37
+
38
+ ## How It Works
39
+
40
+ 1. Audio messages are automatically transcribed before reaching the agent
41
+ 2. First transcription loads the model (~3 seconds)
42
+ 3. Model stays loaded for subsequent transcriptions
43
+ 4. After 20 minutes of inactivity, model unloads to save memory
44
+
45
+ ## Model Selection
46
+
47
+ The daemon automatically selects the model:
48
+
49
+ 1. **Symlink** (`~/.openclaw/tools/parakeet/model`) - set by install.sh
50
+ 2. **Environment variable** `PARAKEET_MODEL_VERSION=v2` or `v3`
51
+ 3. **Auto-detect** - looks for model-v2, then model-v3 directories
52
+
53
+ ## Troubleshooting
54
+
55
+ ### Check if configured
56
+
57
+ Look at `tools.media.audio.models` in openclaw.json - it should point to the parakeet client script.
58
+
59
+ ### Check daemon status
60
+
61
+ ```bash
62
+ # Check if daemon socket exists
63
+ ls -la /tmp/parakeet-lazy.sock
64
+
65
+ # Watch logs
66
+ openclaw logs --follow | grep -i parakeet
67
+ ```
68
+
69
+ ### Model not found error
70
+
71
+ Run the install script:
72
+ ```bash
73
+ ~/.openclaw/extensions/parakeet-stt/scripts/install.sh v2
74
+ ```
75
+
76
+ ### Manual transcription test
77
+
78
+ ```bash
79
+ # Activate venv and test
80
+ source ~/.openclaw/tools/parakeet/.venv/bin/activate
81
+ python ~/.openclaw/tools/parakeet/parakeet_transcribe.py --model ~/.openclaw/tools/parakeet/model path/to/audio.ogg
82
+ ```
83
+
84
+ ## Configuration
85
+
86
+ In `plugins.entries.parakeet-stt`:
87
+ - `enabled`: Enable/disable
88
+ - `modelVersion`: "v2" or "v3" (informational - actual switching via install.sh)
89
+ - `inactivityTimeoutMin`: Minutes before unloading (default: 20; informational - the daemon currently uses a fixed 20-minute timeout)