@nicfox77/parakeet-stt 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +101 -0
- package/index.ts +83 -0
- package/openclaw.plugin.json +71 -0
- package/package.json +27 -0
- package/scripts/install.sh +137 -0
- package/scripts/parakeet-audio-client.py +73 -0
- package/scripts/parakeet-lazy-daemon.py +156 -0
- package/scripts/parakeet_transcribe.py +270 -0
- package/skills/parakeet/SKILL.md +89 -0
package/README.md
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# Parakeet STT for OpenClaw
|
|
2
|
+
|
|
3
|
+
Fast CPU-based speech-to-text using NVIDIA's Parakeet TDT INT8 models.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **4x faster than real-time** (0.25x RTF)
|
|
8
|
+
- **CPU-only** - no GPU required
|
|
9
|
+
- **Two model versions:**
|
|
10
|
+
- **V2** - English optimized (higher accuracy for English)
|
|
11
|
+
- **V3** - Multilingual (25 European languages, auto-detect)
|
|
12
|
+
- **Lazy loading** - model loads on first transcription, unloads after inactivity
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
### 1. Install the plugin
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
openclaw plugins install @nicfox77/parakeet-stt
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
### 2. Install a model
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
# English optimized (default)
|
|
26
|
+
~/.openclaw/extensions/parakeet-stt/scripts/install.sh v2
|
|
27
|
+
|
|
28
|
+
# Or multilingual
|
|
29
|
+
~/.openclaw/extensions/parakeet-stt/scripts/install.sh v3
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
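This creates a Python virtual environment, installs the Python dependencies (onnxruntime, librosa, soundfile), downloads the pre-quantized INT8 model (~475MB) from the [Handy project](https://github.com/cjpais/Handy), and copies the daemon and client scripts to `~/.openclaw/tools/parakeet`.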
|
|
33
|
+
|
|
34
|
+
### 3. Configure OpenClaw
|
|
35
|
+
|
|
36
|
+
Add to your `openclaw.json`:
|
|
37
|
+
|
|
38
|
+
```json
|
|
39
|
+
{
|
|
40
|
+
"tools": {
|
|
41
|
+
"media": {
|
|
42
|
+
"audio": {
|
|
43
|
+
"enabled": true,
|
|
44
|
+
"models": [
|
|
45
|
+
{
|
|
46
|
+
"type": "cli",
|
|
47
|
+
"command": "/home/YOUR_USER/.openclaw/tools/parakeet/parakeet-audio-client.py",
|
|
48
|
+
"args": ["{{MediaPath}}", "{{OutputDir}}"]
|
|
49
|
+
}
|
|
50
|
+
]
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
},
|
|
54
|
+
"plugins": {
|
|
55
|
+
"entries": {
|
|
56
|
+
"parakeet-stt": {
|
|
57
|
+
"enabled": true,
|
|
58
|
+
"modelVersion": "v2"
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Switching Models
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# Switch to V2 (English)
|
|
69
|
+
~/.openclaw/extensions/parakeet-stt/scripts/install.sh v2
|
|
70
|
+
|
|
71
|
+
# Switch to V3 (Multilingual)
|
|
72
|
+
~/.openclaw/extensions/parakeet-stt/scripts/install.sh v3
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
The install script updates a symlink, so the daemon automatically uses the new model on next load.
|
|
76
|
+
|
|
77
|
+
## CLI Commands
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# Check status
|
|
81
|
+
openclaw parakeet:status
|
|
82
|
+
|
|
83
|
+
# Install model
|
|
84
|
+
openclaw parakeet:install v2
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Requirements
|
|
88
|
+
|
|
89
|
+
- Python 3.8+ (with the `venv` module), plus `wget` or `curl` for the model download
|
|
90
|
+
- ~500MB disk space for model
|
|
91
|
+
- ~500MB RAM when model loaded
|
|
92
|
+
|
|
93
|
+
## Credits
|
|
94
|
+
|
|
95
|
+
- Models: [NVIDIA Parakeet TDT](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2)
|
|
96
|
+
- INT8 Quantization: [Handy](https://github.com/cjpais/Handy) by cjpais
|
|
97
|
+
- ONNX Runtime for inference
|
|
98
|
+
|
|
99
|
+
## License
|
|
100
|
+
|
|
101
|
+
MIT
|
package/index.ts
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Parakeet STT Plugin for OpenClaw
|
|
3
|
+
*
|
|
4
|
+
* Provides fast CPU-based speech-to-text using Parakeet TDT INT8 models.
|
|
5
|
+
* Supports V2 (English optimized) and V3 (Multilingual) model selection.
|
|
6
|
+
*
|
|
7
|
+
* The actual transcription is configured via tools.media.audio.models in openclaw.json.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { Type } from "@sinclair/typebox";
|
|
11
|
+
|
|
12
|
+
/**
 * OpenClaw plugin entry point for Parakeet STT.
 *
 * Registers two CLI commands (`parakeet:status`, `parakeet:install`) and one
 * optional agent tool (`parakeet_status`). None of them runs a transcription
 * or contacts the daemon: they only report the resolved plugin settings and
 * the install command the user should run.
 *
 * @param api - OpenClaw plugin API. This function uses `logger`, `config`,
 *   `registerCommand` and `registerTool`.
 */
export default function register(api: any) {
  const PLUGIN_ID = "parakeet-stt";
  const SUPPORTED_VERSIONS = ["v2", "v3"];

  // Logger is treated as optional everywhere so a host without `info` cannot crash the plugin.
  const logInfo = (message: string): void => {
    api.logger?.info?.(message);
  };

  /** Home directory used to build display paths; falls back to "~" instead of rendering "undefined". */
  const homeDir = (): string => process.env.HOME ?? process.env.USERPROFILE ?? "~";

  /** Resolves the plugin settings once so every command and tool reports the same values. */
  const resolveSettings = () => {
    const entry = api.config?.plugins?.entries?.[PLUGIN_ID];
    const cfg = entry ?? {};
    const toolsDir = `${homeDir()}/.openclaw/tools/parakeet`;
    return {
      cfg,
      // "configured" means the user has a plugin entry at all, independent of `enabled`.
      configured: entry != null,
      // Absent `enabled` counts as enabled; only an explicit `false` disables.
      enabled: cfg.enabled !== false,
      modelVersion: cfg.modelVersion || "v2",
      modelPath: cfg.modelPath || `${toolsDir}/model`,
      daemonPath: cfg.daemonPath || `${toolsDir}/parakeet-lazy-daemon.py`,
    };
  };

  logInfo("parakeet-stt: plugin loaded");

  // Register a CLI command for checking Parakeet status
  api.registerCommand({
    name: "parakeet:status",
    description: "Check Parakeet STT daemon status",
    async handler() {
      const settings = resolveSettings();
      return {
        modelVersion: settings.modelVersion,
        modelPath: settings.modelPath,
        daemonPath: settings.daemonPath,
        enabled: settings.enabled,
        timeout: settings.cfg.timeoutMs || 30000,
        inactivityTimeout: (settings.cfg.inactivityTimeoutMin || 20) + " minutes",
        installCommand: `bash ~/.openclaw/extensions/parakeet-stt/scripts/install.sh ${settings.modelVersion}`
      };
    },
  });

  // Register a CLI command for installing the model
  api.registerCommand({
    name: "parakeet:install",
    description: "Download and install a Parakeet TDT model",
    async handler(args: { version?: string }) {
      const version = args?.version || resolveSettings().modelVersion;
      const hint = "v2 = English optimized, v3 = Multilingual (25 languages)";
      logInfo(`parakeet-stt: install command called for ${version}`);

      // The version is interpolated into a shell command the user is told to run,
      // so anything outside the known set is rejected rather than echoed back.
      if (!SUPPORTED_VERSIONS.includes(version)) {
        return {
          message: `Unsupported model version "${version}". Choose one of: ${SUPPORTED_VERSIONS.join(", ")}.`,
          hint
        };
      }

      const installScript = `${homeDir()}/.openclaw/extensions/parakeet-stt/scripts/install.sh`;
      return {
        message: `Run the install script for ${version}:`,
        command: `bash ${installScript} ${version}`,
        hint
      };
    },
  });

  // Register an agent tool for checking transcription status
  api.registerTool(
    {
      name: "parakeet_status",
      description: "Check the status of the Parakeet speech-to-text system",
      parameters: Type.Object({}),
      async execute() {
        const settings = resolveSettings();
        return {
          content: [
            {
              type: "text",
              text: JSON.stringify({
                enabled: settings.enabled,
                modelVersion: settings.modelVersion,
                modelPath: settings.modelPath,
                configured: settings.configured
              }, null, 2)
            }
          ]
        };
      },
    },
    { optional: true }
  );
}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "parakeet-stt",
|
|
3
|
+
"name": "Parakeet STT",
|
|
4
|
+
"description": "Fast CPU-based speech-to-text using Parakeet TDT INT8 models. 4x faster than real-time. Supports V2 (English optimized) and V3 (multilingual).",
|
|
5
|
+
"version": "0.2.0",
|
|
6
|
+
"skills": ["skills/parakeet"],
|
|
7
|
+
"configSchema": {
|
|
8
|
+
"type": "object",
|
|
9
|
+
"additionalProperties": false,
|
|
10
|
+
"properties": {
|
|
11
|
+
"enabled": {
|
|
12
|
+
"type": "boolean"
|
|
13
|
+
},
|
|
14
|
+
"modelVersion": {
|
|
15
|
+
"type": "string",
|
|
16
|
+
"enum": ["v2", "v3"],
|
|
17
|
+
"default": "v2",
|
|
18
|
+
"description": "Model version: v2 for English (higher accuracy), v3 for multilingual (25 languages, auto-detect)"
|
|
19
|
+
},
|
|
20
|
+
"modelPath": {
|
|
21
|
+
"type": "string",
|
|
22
|
+
"description": "Path to the Parakeet ONNX model directory (defaults to the active-model symlink ~/.openclaw/tools/parakeet/model)"
|
|
23
|
+
},
|
|
24
|
+
"daemonPath": {
|
|
25
|
+
"type": "string",
|
|
26
|
+
"description": "Path to the parakeet daemon script"
|
|
27
|
+
},
|
|
28
|
+
"timeoutMs": {
|
|
29
|
+
"type": "integer",
|
|
30
|
+
"minimum": 1000,
|
|
31
|
+
"default": 30000,
|
|
32
|
+
"description": "Timeout for transcription requests, in milliseconds"
|
|
33
|
+
},
|
|
34
|
+
"inactivityTimeoutMin": {
|
|
35
|
+
"type": "integer",
|
|
36
|
+
"minimum": 1,
|
|
37
|
+
"default": 20,
|
|
38
|
+
"description": "Minutes of inactivity before unloading the model"
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
},
|
|
42
|
+
"uiHints": {
|
|
43
|
+
"enabled": {
|
|
44
|
+
"label": "Enable Parakeet STT"
|
|
45
|
+
},
|
|
46
|
+
"modelVersion": {
|
|
47
|
+
"label": "Model Version",
|
|
48
|
+
"help": "v2 = English optimized (higher accuracy), v3 = Multilingual (25 European languages with auto-detect)"
|
|
49
|
+
},
|
|
50
|
+
"modelPath": {
|
|
51
|
+
"label": "Model Path",
|
|
52
|
+
"placeholder": "~/.openclaw/tools/parakeet/model",
|
|
53
|
+
"help": "Directory containing the Parakeet ONNX model files",
|
|
54
|
+
"advanced": true
|
|
55
|
+
},
|
|
56
|
+
"daemonPath": {
|
|
57
|
+
"label": "Daemon Script Path",
|
|
58
|
+
"placeholder": "~/.openclaw/tools/parakeet/parakeet-lazy-daemon.py",
|
|
59
|
+
"advanced": true
|
|
60
|
+
},
|
|
61
|
+
"timeoutMs": {
|
|
62
|
+
"label": "Request Timeout (ms)",
|
|
63
|
+
"advanced": true
|
|
64
|
+
},
|
|
65
|
+
"inactivityTimeoutMin": {
|
|
66
|
+
"label": "Inactivity Timeout (minutes)",
|
|
67
|
+
"help": "How long before the model unloads to save memory",
|
|
68
|
+
"advanced": true
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
}
|
package/package.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@nicfox77/parakeet-stt",
|
|
3
|
+
"version": "0.2.0",
|
|
4
|
+
"description": "Parakeet TDT INT8 speech-to-text plugin for OpenClaw. Supports V2 (English) and V3 (Multilingual) models.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "git+https://github.com/Nicfox77/openclaw-parakeet-stt.git"
|
|
9
|
+
},
|
|
10
|
+
"author": "Nicfox77",
|
|
11
|
+
"license": "MIT",
|
|
12
|
+
"keywords": [
|
|
13
|
+
"openclaw",
|
|
14
|
+
"parakeet",
|
|
15
|
+
"stt",
|
|
16
|
+
"speech-to-text",
|
|
17
|
+
"transcription",
|
|
18
|
+
"asr",
|
|
19
|
+
"nvidia"
|
|
20
|
+
],
|
|
21
|
+
"dependencies": {
|
|
22
|
+
"@sinclair/typebox": "0.34.48"
|
|
23
|
+
},
|
|
24
|
+
"openclaw": {
|
|
25
|
+
"extensions": ["./index.ts"]
|
|
26
|
+
}
|
|
27
|
+
}
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
#!/bin/bash
# Parakeet TDT INT8 Model Installer
# Downloads pre-quantized INT8 models from the Handy project
# https://github.com/cjpais/Handy
#
# Usage: install.sh [v2|v3]
#
# Steps: create/reuse a Python venv, install the Python dependencies,
# download and extract the requested model if it is not installed yet,
# point the "model" symlink at it, and copy the runtime scripts next to it.
# Re-running with another version only downloads when that model is missing.

set -e

# Configuration
PARAKEET_DIR="${PARAKEET_DIR:-$HOME/.openclaw/tools/parakeet}"
VENV_DIR="$PARAKEET_DIR/.venv"

# Model URLs (from Handy project)
MODEL_URLS_V2="https://blob.handy.computer/parakeet-v2-int8.tar.gz"
MODEL_URLS_V3="https://blob.handy.computer/parakeet-v3-int8.tar.gz"

# File used to decide whether a model is installed. It is one of the files
# parakeet_transcribe.py loads from the model directory.
MODEL_MARKER="encoder-model.int8.onnx"

# Default to V2 (English optimized)
VERSION="${1:-v2}"

# Validate version
if [[ "$VERSION" != "v2" && "$VERSION" != "v3" ]]; then
    echo "Usage: $0 [v2|v3]"
    echo "  v2 - English optimized (higher accuracy for English)"
    echo "  v3 - Multilingual (25 European languages, auto-detect)"
    exit 1
fi

# Select URL based on version
if [[ "$VERSION" == "v2" ]]; then
    MODEL_URL="$MODEL_URLS_V2"
    MODEL_DIR="$PARAKEET_DIR/model-v2"
    MODEL_SIZE="473MB"
    MODEL_DESC="English optimized"
else
    MODEL_URL="$MODEL_URLS_V3"
    MODEL_DIR="$PARAKEET_DIR/model-v3"
    MODEL_SIZE="478MB"
    MODEL_DESC="Multilingual (25 languages)"
fi

# Symlink that always points at the active model
ACTIVE_MODEL_LINK="$PARAKEET_DIR/model"

echo "=== Parakeet TDT $VERSION INT8 Installer ==="
echo "Model: $MODEL_DESC"
echo ""

# Check Python
if ! command -v python3 &> /dev/null; then
    echo "Error: Python 3 is required"
    exit 1
fi

echo "Python version: $(python3 --version)"

# Create directories
mkdir -p "$PARAKEET_DIR"

# Create virtual environment (reuse existing if present)
if [ ! -d "$VENV_DIR" ]; then
    echo "Creating Python virtual environment..."
    python3 -m venv "$VENV_DIR"
fi

# Activate venv
source "$VENV_DIR/bin/activate"

# Install minimal dependencies (Handy models just need onnxruntime + librosa)
echo "Installing Python dependencies..."
pip install --upgrade pip
pip install onnxruntime librosa soundfile

# Download and extract model if not present
if [ ! -f "$MODEL_DIR/$MODEL_MARKER" ]; then
    echo ""
    echo "Downloading Parakeet TDT $VERSION model (~$MODEL_SIZE)..."
    echo "URL: $MODEL_URL"

    # Private scratch directory instead of a predictable /tmp file name;
    # removed on exit whether the install succeeds or fails.
    WORK_DIR="$(mktemp -d)"
    trap 'rm -rf "$WORK_DIR"' EXIT
    TMP_TAR="$WORK_DIR/parakeet-$VERSION-int8.tar.gz"

    if command -v wget &> /dev/null; then
        wget -O "$TMP_TAR" "$MODEL_URL"
    elif command -v curl &> /dev/null; then
        # -f: fail on HTTP errors instead of saving the error page as the archive
        curl -fL -o "$TMP_TAR" "$MODEL_URL"
    else
        echo "Error: wget or curl required for download"
        exit 1
    fi

    echo "Extracting model..."
    mkdir -p "$WORK_DIR/extract"
    tar -xzf "$TMP_TAR" -C "$WORK_DIR/extract"

    # Locate the model files wherever the archive put them (top level or a
    # subdirectory) rather than guessing with --strip-components.
    MARKER_PATH="$(find "$WORK_DIR/extract" -type f -name "$MODEL_MARKER" -print -quit)"
    if [ -z "$MARKER_PATH" ]; then
        echo "Error: $MODEL_MARKER not found in downloaded archive" >&2
        exit 1
    fi

    mkdir -p "$MODEL_DIR"
    cp -R "$(dirname "$MARKER_PATH")/." "$MODEL_DIR/"

    echo "Model downloaded and extracted successfully"
else
    echo "Model already exists at $MODEL_DIR"
fi

# Update symlink to active model
if [ -e "$ACTIVE_MODEL_LINK" ] && [ ! -L "$ACTIVE_MODEL_LINK" ]; then
    echo "Error: $ACTIVE_MODEL_LINK exists and is not a symlink; move it aside and re-run" >&2
    exit 1
fi
ln -sfn "$MODEL_DIR" "$ACTIVE_MODEL_LINK"
echo "Active model symlink: $ACTIVE_MODEL_LINK -> $MODEL_DIR"

# Copy scripts to the tools directory. Prefer the directory this installer
# lives in; fall back to the default extension location.
SCRIPTS_SRC="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [ ! -f "$SCRIPTS_SRC/parakeet-lazy-daemon.py" ]; then
    SCRIPTS_SRC="$HOME/.openclaw/extensions/parakeet-stt/scripts"
fi
for script in parakeet-lazy-daemon.py parakeet-audio-client.py parakeet_transcribe.py; do
    if [ -f "$SCRIPTS_SRC/$script" ]; then
        cp "$SCRIPTS_SRC/$script" "$PARAKEET_DIR/"
        chmod +x "$PARAKEET_DIR/$script"
        echo "Copied $script"
    else
        echo "Warning: $script not found in $SCRIPTS_SRC" >&2
    fi
done

echo ""
echo "=== Installation Complete ==="
echo ""
echo "Version: $VERSION ($MODEL_DESC)"
echo "Model directory: $MODEL_DIR"
echo "Active model: $ACTIVE_MODEL_LINK"
echo "Virtual environment: $VENV_DIR"
echo ""
echo "To switch models, run:"
echo "  $0 v2  # English optimized"
echo "  $0 v3  # Multilingual"
echo ""
echo "OpenClaw config (add to openclaw.json if not already present):"
echo "  tools.media.audio.models → $PARAKEET_DIR/parakeet-audio-client.py"
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import socket
|
|
5
|
+
import subprocess
|
|
6
|
+
import sys
|
|
7
|
+
import time
|
|
8
|
+
|
|
9
|
+
# Unix socket the lazy daemon listens on.
SOCKET_PATH = "/tmp/parakeet-lazy.sock"
# Daemon script location under the current user's home directory.
DAEMON_PATH = os.path.join(
    os.path.expanduser("~"), ".openclaw", "tools", "parakeet", "parakeet-lazy-daemon.py"
)
|
|
11
|
+
|
|
12
|
+
def ensure_daemon():
    """Make sure the transcription daemon is accepting connections.

    Returns immediately when the socket at SOCKET_PATH already accepts a
    connection. Otherwise starts the daemon detached in the background and
    waits until it is reachable, up to a startup deadline.

    The daemon is started with the interpreter from the ``.venv`` directory
    next to DAEMON_PATH when it exists, because that is where install.sh puts
    onnxruntime and librosa; the interpreter running this client is used only
    as a fallback.

    Exits the process with status 1 if the daemon cannot be spawned or exits
    during startup. If the deadline passes without either outcome, returns
    and leaves error reporting to the subsequent request.
    """
    startup_deadline_s = 15.0
    poll_interval_s = 0.2

    def _daemon_alive():
        # A successful connect is the liveness signal; nothing is sent.
        try:
            with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
                s.settimeout(0.5)
                s.connect(SOCKET_PATH)
            return True
        except Exception:
            return False

    if _daemon_alive():
        return  # daemon already running

    venv_python = os.path.join(os.path.dirname(DAEMON_PATH), ".venv", "bin", "python")
    interpreter = venv_python if os.path.exists(venv_python) else sys.executable

    # Start daemon in background, detached from this client's session
    try:
        proc = subprocess.Popen(
            [interpreter, DAEMON_PATH],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            start_new_session=True
        )
    except Exception as e:
        print(f"Failed to start daemon: {e}", file=sys.stderr)
        sys.exit(1)

    # Poll for readiness instead of a fixed sleep: importing the daemon's
    # dependencies can take longer than a second.
    deadline = time.time() + startup_deadline_s
    while time.time() < deadline:
        if _daemon_alive():
            return
        if proc.poll() is not None:
            print(
                f"Failed to start daemon: process exited with code {proc.returncode}",
                file=sys.stderr,
            )
            sys.exit(1)
        time.sleep(poll_interval_s)
|
|
33
|
+
|
|
34
|
+
def query_daemon(audio_path):
    """Ask the daemon to transcribe ``audio_path`` and print the transcript.

    Sends one newline-terminated JSON request over the Unix socket and reads
    one newline-terminated JSON reply. Up to three attempts are made when the
    connection fails or the reply is empty.

    Args:
        audio_path: Path to the audio file. It is made absolute before being
            sent, because the daemon runs with its own working directory.

    Returns:
        0 after printing the transcript to stdout, 1 after printing an error
        to stderr.
    """
    max_attempts = 3
    request = {"action": "transcribe", "audio_path": os.path.abspath(audio_path)}
    payload = json.dumps(request).encode() + b"\n"

    for attempt in range(max_attempts):
        is_last_attempt = attempt == max_attempts - 1
        try:
            with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
                # No timeout here: the first request also pays for model loading.
                s.connect(SOCKET_PATH)
                s.sendall(payload)
                response_data = b""
                while True:
                    chunk = s.recv(4096)
                    if not chunk:
                        break
                    response_data += chunk
                    if b"\n" in chunk:
                        break
            if response_data:
                response = json.loads(response_data.strip())
                if "text" in response:
                    print(response["text"])
                    return 0
                print(response.get("error", "Unknown error"), file=sys.stderr)
                return 1
            if not is_last_attempt:
                time.sleep(0.5)
        except Exception as e:
            if is_last_attempt:
                print(f"Daemon communication failed: {e}", file=sys.stderr)
                return 1
            time.sleep(0.5)

    # Every attempt connected but got no data back; say so instead of failing silently.
    print("Daemon communication failed: empty response from daemon", file=sys.stderr)
    return 1
|
|
65
|
+
|
|
66
|
+
if __name__ == "__main__":
    # CLI entry point: <audio_path> is required; [output_dir] is accepted but unused.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: parakeet-audio-client.py <audio_path> [output_dir]", file=sys.stderr)
        sys.exit(1)
    audio_path = cli_args[0]
    ensure_daemon()
    exit_code = query_daemon(audio_path)
    sys.exit(exit_code)
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Parakeet Lazy Daemon - Loads model on demand, unloads after inactivity.
|
|
4
|
+
Supports V2 (English) and V3 (Multilingual) model selection.
|
|
5
|
+
"""
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
import signal
|
|
9
|
+
import socket
|
|
10
|
+
import sys
|
|
11
|
+
import time
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
# Add script directory to path to import ParakeetTDT
SCRIPT_DIR = Path(__file__).parent
sys.path.insert(0, str(SCRIPT_DIR))

# The import fails when parakeet_transcribe.py has not been copied next to this
# script, or when one of its own imports (e.g. onnxruntime, librosa) is missing
# from the running interpreter. Report the underlying cause in both cases.
try:
    from parakeet_transcribe import ParakeetTDT
except ImportError as exc:
    print(
        f"Error: could not import parakeet_transcribe ({exc}). Run install.sh first.",
        file=sys.stderr,
    )
    sys.exit(1)

# Unix socket the daemon listens on (the client uses the same path).
SOCKET_PATH = "/tmp/parakeet-lazy.sock"
# Seconds without a request after which the model is unloaded.
IDLE_TIMEOUT = 20 * 60  # 20 minutes
|
|
27
|
+
|
|
28
|
+
def get_model_path():
    """Determine which model directory to use.

    Resolution order:
      1. ``~/.openclaw/tools/parakeet/model`` when it is a directory or a
         symlink to one (install.sh maintains this symlink). A dangling
         symlink is ignored so the later steps can still find a model.
      2. The ``PARAKEET_MODEL_VERSION`` environment variable
         (``v2``/``2`` or ``v3``/``3``); the path is returned even if that
         model is not installed.
      3. The first installed model directory, preferring v2.
      4. The ``model`` path itself, which the caller reports as missing.

    Returns:
        pathlib.Path of the chosen model directory (may not exist in cases
        2 and 4).
    """
    tools_dir = Path.home() / ".openclaw" / "tools" / "parakeet"

    # 1. Active-model symlink (created by install.sh); is_dir() follows the
    #    link, so a link whose target was removed does not match.
    symlink = tools_dir / "model"
    if symlink.is_dir():
        return symlink.resolve()

    # 2. Check environment variable
    model_version = os.environ.get("PARAKEET_MODEL_VERSION", "").lower()
    if model_version in ("v2", "2"):
        return tools_dir / "model-v2"
    if model_version in ("v3", "3"):
        return tools_dir / "model-v3"

    # 3. Check for installed models, prefer v2 (English)
    for version in ["v2", "v3"]:
        model_dir = tools_dir / f"model-{version}"
        if model_dir.is_dir():
            return model_dir

    # 4. Fallback to symlink path (will error if not installed)
    return symlink
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class ParakeetLazyDaemon:
    """Unix-socket server that loads the Parakeet model on demand.

    Protocol: each connection carries one newline-terminated JSON request
    ``{"action": "transcribe", "audio_path": ...}`` and receives one
    newline-terminated JSON reply, either
    ``{"text", "tokens", "timestamps"}`` or ``{"error": ...}``.

    The model is loaded on the first transcription and dropped once no
    request has been served for IDLE_TIMEOUT seconds. The accept loop wakes
    up every POLL_INTERVAL seconds so idle unloading and shutdown signals are
    handled even when no client connects.
    """

    # Seconds accept() waits before the loop re-checks idle time and the shutdown flag.
    POLL_INTERVAL = 5.0
    # Seconds a connected client has for each socket operation (request read / reply write).
    CLIENT_TIMEOUT = 10.0

    def __init__(self):
        """Validate the model directory, bind the socket and install signal handlers.

        Exits the process with status 1 when no model directory is found.
        """
        self.model_dir = get_model_path()
        self.transcriber = None
        self.last_used = None
        self.running = True

        # Validate model exists
        if not self.model_dir.is_dir():
            print(f"Error: Model not found at {self.model_dir}", file=sys.stderr)
            print("Run: ~/.openclaw/extensions/parakeet-stt/scripts/install.sh [v2|v3]", file=sys.stderr)
            sys.exit(1)

        print(f"Using model: {self.model_dir}", file=sys.stderr)

        # Clean up any existing socket
        try:
            os.unlink(SOCKET_PATH)
        except OSError:
            if os.path.exists(SOCKET_PATH):
                raise

        self.server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        self.server.bind(SOCKET_PATH)
        # NOTE(review): 0o666 lets any local user submit transcription requests
        # for files this process can read; consider 0o600 if only the owning
        # user needs access.
        os.chmod(SOCKET_PATH, 0o666)  # world readable/writable
        self.server.listen(1)
        # Without a timeout accept() blocks until the next client, so idle
        # unloading and the shutdown flag would never be checked.
        self.server.settimeout(self.POLL_INTERVAL)
        signal.signal(signal.SIGTERM, self._handle_signal)
        signal.signal(signal.SIGINT, self._handle_signal)

    def _handle_signal(self, signum, frame):
        """Request shutdown; the accept loop exits on its next wake-up."""
        self.running = False

    def ensure_loaded(self):
        """Load the model if it is not in memory and refresh the idle clock."""
        if self.transcriber is None:
            # Re-resolve the model path on every load so a model switched by
            # install.sh is picked up without restarting the daemon.
            current_dir = get_model_path()
            if current_dir.is_dir():
                self.model_dir = current_dir
            print("Loading Parakeet model...", file=sys.stderr)
            self.transcriber = ParakeetTDT(str(self.model_dir))
            print("Model loaded.", file=sys.stderr)
        self.last_used = time.time()

    def unload_if_idle(self):
        """Drop the model when it has not been used for more than IDLE_TIMEOUT seconds."""
        if self.transcriber is None or self.last_used is None:
            return
        idle = time.time() - self.last_used
        if idle > IDLE_TIMEOUT:
            print(f"Unloading model (idle {idle:.1f}s)", file=sys.stderr)
            self.transcriber = None
            import gc
            gc.collect()

    def handle_connection(self, conn):
        """Serve one request on ``conn`` and always close it.

        Any failure while reading or transcribing is reported to the client
        as ``{"error": ...}``. A failure while sending the reply (for example
        the peer already disconnected) is logged and does not propagate.
        """
        response = None
        try:
            # Bound how long a silent client can hold the single-threaded daemon.
            conn.settimeout(self.CLIENT_TIMEOUT)
            data = b""
            while True:
                chunk = conn.recv(4096)
                if not chunk:
                    break
                data += chunk
                if b"\n" in chunk:
                    break
            if not data:
                response = {"error": "Empty request"}
            else:
                line = data.split(b"\n", 1)[0].strip()
                request = json.loads(line.decode())
                action = request.get("action")
                if action != "transcribe":
                    response = {"error": f"Unsupported action: {action}"}
                else:
                    audio_path = request["audio_path"]
                    self.ensure_loaded()
                    audio = self.transcriber.load_audio(audio_path)
                    text, tokens, timestamps = self.transcriber.transcribe(audio)
                    self.last_used = time.time()
                    response = {"text": text, "tokens": tokens, "timestamps": timestamps}
        except Exception as e:
            response = {"error": str(e)}
        finally:
            try:
                if response is not None:
                    conn.sendall(json.dumps(response).encode() + b"\n")
            except OSError as e:
                print("Daemon error: could not send response:", e, file=sys.stderr)
            finally:
                conn.close()
            self.unload_if_idle()

    def run(self):
        """Accept and serve connections until a shutdown signal, then clean up the socket."""
        print("ParakeetLazyDaemon listening on", SOCKET_PATH, file=sys.stderr)
        while self.running:
            try:
                conn, addr = self.server.accept()
            except socket.timeout:
                # Periodic wake-up with no client: the only place idle time can elapse.
                self.unload_if_idle()
                continue
            except Exception as e:
                if self.running:
                    print("Daemon error:", e, file=sys.stderr)
                continue
            try:
                self.handle_connection(conn)
            except Exception as e:
                print("Daemon error:", e, file=sys.stderr)
                continue  # Keep serving instead of breaking
        self.server.close()
        try:
            os.unlink(SOCKET_PATH)
        except OSError:
            pass
|
|
153
|
+
|
|
154
|
+
if __name__ == "__main__":
    # Construct the daemon (binds the socket) and serve until it is told to stop.
    ParakeetLazyDaemon().run()
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Parakeet TDT INT8 Transcription Script (V2 and V3 models)
|
|
4
|
+
Based on transcribe-rs implementation: https://github.com/cjpais/transcribe-rs
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import argparse
|
|
8
|
+
import json
|
|
9
|
+
import time
|
|
10
|
+
import re
|
|
11
|
+
import numpy as np
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import onnxruntime as ort
|
|
15
|
+
import librosa
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ParakeetTDT:
    """Parakeet TDT INT8 transcriber using ONNX Runtime.

    Loads three ONNX graphs from a model directory (``nemo128.onnx``
    preprocessor, ``encoder-model.int8.onnx``, ``decoder_joint-model.int8.onnx``)
    plus ``config.json`` and ``vocab.txt``, and runs greedy decoding over the
    encoder output. Progress and timing lines are printed to stdout.
    """

    # Constants from transcribe-rs
    SUBSAMPLING_FACTOR = 8
    WINDOW_SIZE = 0.01
    MAX_TOKENS_PER_STEP = 10

    def __init__(self, model_dir: str):
        """Load config, vocabulary and the three ONNX sessions from ``model_dir``.

        Raises whatever ``open``/``json.load``/``ort.InferenceSession`` raise
        when a file is missing or invalid.
        """
        model_dir = Path(model_dir)

        # Load config
        config_path = model_dir / "config.json"
        with open(config_path) as f:
            self.config = json.load(f)

        # Load vocabulary
        self.vocab, self.blank_idx = self._load_vocab(model_dir / "vocab.txt")
        self.vocab_size = len(self.vocab)
        print(f"Loaded vocabulary: {self.vocab_size} tokens, blank_idx={self.blank_idx}")

        self.sample_rate = 16000

        # Create ONNX sessions
        sess_options = ort.SessionOptions()
        sess_options.intra_op_num_threads = 4
        sess_options.inter_op_num_threads = 4
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

        print("Loading preprocessor...")
        start = time.time()
        self.preprocessor = ort.InferenceSession(
            str(model_dir / "nemo128.onnx"), sess_options)
        print(f"Preprocessor loaded in {time.time() - start:.2f}s")

        print("Loading encoder...")
        start = time.time()
        self.encoder = ort.InferenceSession(
            str(model_dir / "encoder-model.int8.onnx"), sess_options)
        print(f"Encoder loaded in {time.time() - start:.2f}s")

        print("Loading decoder...")
        start = time.time()
        self.decoder = ort.InferenceSession(
            str(model_dir / "decoder_joint-model.int8.onnx"), sess_options)
        print(f"Decoder loaded in {time.time() - start:.2f}s")

    def _load_vocab(self, vocab_path: Path) -> tuple:
        """Load vocabulary from a ``vocab.txt`` file.

        Each line is ``<token> <index>``. The SentencePiece marker U+2581 is
        replaced by a space. Lines with fewer than two fields are skipped.

        Returns:
            (vocab, blank_idx): dict mapping index -> token text, and the
            index of the ``<blk>`` token (0 if the file has none).
        """
        vocab = {}
        blank_idx = 0

        # The file contains U+2581, so it must be decoded as UTF-8 rather
        # than with the platform's locale encoding.
        with open(vocab_path, encoding="utf-8") as f:
            for line in f:
                parts = line.rstrip().split(' ')
                if len(parts) >= 2:
                    # Replace SentencePiece space marker with actual space
                    token = parts[0].replace('\u2581', ' ')
                    idx = int(parts[1])
                    vocab[idx] = token
                    if token.strip() == '<blk>':
                        blank_idx = idx

        return vocab, blank_idx

    def load_audio(self, audio_path: str) -> np.ndarray:
        """Load audio file and convert to 16kHz mono float32."""
        audio, sr = librosa.load(audio_path, sr=self.sample_rate, mono=True)
        return audio.astype(np.float32)

    def transcribe(self, audio: np.ndarray) -> tuple:
        """Transcribe audio using the Parakeet TDT model.

        Args:
            audio: 1-D waveform as returned by ``load_audio``.

        Returns:
            (text, tokens, timestamps): decoded text, emitted token ids, and
            for each token the encoder frame index at which it was emitted.
            Empty audio yields ``("", [], [])`` without running the model.
        """
        if audio.size == 0:
            return "", [], []

        # Prepare audio: batch of one, float32 as produced by load_audio
        waveforms = audio.reshape(1, -1).astype(np.float32, copy=False)
        waveforms_lens = np.array([audio.shape[0]], dtype=np.int64)

        # Preprocess (mel spectrogram)
        start = time.time()
        prep_out = self.preprocessor.run(
            None,
            {'waveforms': waveforms, 'waveforms_lens': waveforms_lens}
        )
        features, features_lens = prep_out[0], prep_out[1]
        print(f"Preprocessor: {time.time() - start:.3f}s, features shape: {features.shape}")

        # Encode
        start = time.time()
        enc_out = self.encoder.run(
            None,
            {'audio_signal': features, 'length': features_lens}
        )
        encoder_out = enc_out[0]  # [1, 1024, time]
        encoder_out_lens = enc_out[1]
        # Transpose to [1, time, 1024] like transcribe-rs
        encoder_out = encoder_out.transpose(0, 2, 1)
        print(f"Encoder: {time.time() - start:.3f}s, encoded shape: {encoder_out.shape}")

        # Decode
        start = time.time()
        tokens, timestamps = self._decode_sequence(
            encoder_out[0], int(encoder_out_lens[0]))
        decode_time = time.time() - start
        print(f"Decode: {decode_time:.3f}s")

        # Convert to text
        text = self._decode_tokens(tokens)

        return text, tokens, timestamps

    def _create_decoder_state(self) -> tuple:
        """Create initial decoder state (LSTM hidden states)."""
        # Shape: [2, 1, 640] for batch_size=1
        state1 = np.zeros((2, 1, 640), dtype=np.float32)
        state2 = np.zeros((2, 1, 640), dtype=np.float32)
        return state1, state2

    def _decode_step(self, prev_tokens: list, prev_state: tuple,
                     encoder_step: np.ndarray) -> tuple:
        """Run one decoder step.

        Args:
            prev_tokens: Previously emitted tokens
            prev_state: Previous decoder state (state1, state2)
            encoder_step: Encoder output for current frame [1024]

        Returns:
            (logits, new_state)
        """
        # Get last token or blank if empty
        target_token = prev_tokens[-1] if prev_tokens else self.blank_idx

        # Prepare inputs
        # encoder_outputs: [1, 1024, 1] (batch, dim, time) - matches ONNX input shape
        encoder_outputs = encoder_step.reshape(1, -1, 1).astype(np.float32)
        targets = np.array([[target_token]], dtype=np.int32)
        target_length = np.array([1], dtype=np.int32)  # Must be int32 for this model
        state1, state2 = prev_state

        outputs = self.decoder.run(
            None,
            {
                'encoder_outputs': encoder_outputs,
                'targets': targets,
                'target_length': target_length,
                'input_states_1': state1,
                'input_states_2': state2,
            }
        )

        logits = outputs[0]  # [1, 1, vocab_size + duration]
        new_state1 = outputs[2]
        new_state2 = outputs[3]

        return logits[0, 0], (new_state1, new_state2)

    def _decode_sequence(self, encodings: np.ndarray,
                         encodings_len: int) -> tuple:
        """Decode encoded sequence using greedy algorithm.

        Implements TDT decoding with MAX_TOKENS_PER_STEP limit. Duration
        logits are ignored: the frame index advances by one on a blank, or
        after MAX_TOKENS_PER_STEP tokens were emitted on the same frame.

        Returns:
            (tokens, timestamps): emitted token ids and, per token, the frame
            index at which it was emitted.
        """
        prev_state = self._create_decoder_state()
        tokens = []
        timestamps = []

        t = 0
        emitted_tokens = 0

        while t < encodings_len:
            encoder_step = encodings[t]  # [1024]
            logits, new_state = self._decode_step(tokens, prev_state, encoder_step)

            # For TDT: the first vocab_size entries are vocab logits; any
            # remaining entries are duration logits, unused in greedy decoding.
            if len(logits) > self.vocab_size:
                vocab_logits = logits[:self.vocab_size]
            else:
                vocab_logits = logits

            # Get argmax token
            token = int(np.argmax(vocab_logits))

            # Decoder state only advances when a real token is emitted
            if token != self.blank_idx:
                prev_state = new_state
                tokens.append(token)
                timestamps.append(t)
                emitted_tokens += 1

            # Advance frame on blank OR after max tokens per step
            if token == self.blank_idx or emitted_tokens == self.MAX_TOKENS_PER_STEP:
                t += 1
                emitted_tokens = 0

        return tokens, timestamps

    def _decode_tokens(self, ids: list) -> str:
        """Convert token IDs to text.

        Token strings already carry their leading space (set when the
        vocabulary was loaded), so they are concatenated directly. Ids not
        present in the vocabulary are skipped. Runs of spaces are collapsed
        and the result is stripped.
        """
        # Look ids up by key: the vocabulary is a dict, so comparing an id
        # against len(vocab) is wrong whenever the indices are not contiguous.
        pieces = [self.vocab[token_id] for token_id in ids if token_id in self.vocab]

        # Join all tokens - spaces are already embedded
        text = ''.join(pieces)

        # Clean up multiple spaces
        text = re.sub(r' +', ' ', text)

        return text.strip()
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def main():
    """Command-line entry point: transcribe one audio file and report timing.

    Parses the audio path and the optional ``--model`` directory, loads the
    model and the audio, runs the transcription, then prints the text, the
    elapsed time and the real-time factor (elapsed time / audio duration).
    """
    parser = argparse.ArgumentParser(description='Parakeet TDT V2 INT8 Transcription')
    parser.add_argument('audio', help='Path to audio file')
    parser.add_argument('--model', default='~/.openclaw/models/parakeet-tdt-0.6b-v2-int8',
                        help='Path to model directory')
    args = parser.parse_args()

    # The default model path starts with '~', so expand it explicitly.
    model_path = Path(args.model).expanduser()
    audio_path = args.audio

    print(f"Loading model from {model_path}...")
    transcriber = ParakeetTDT(model_path)

    print(f"\nLoading audio from {audio_path}...")
    audio = transcriber.load_audio(audio_path)
    duration = len(audio) / transcriber.sample_rate
    print(f"Audio duration: {duration:.2f}s")

    print("\nTranscribing...")
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()
    text, _tokens, _timestamps = transcriber.transcribe(audio)
    total_time = time.perf_counter() - start

    print(f"\n{'='*60}")
    print("TRANSCRIPTION:")
    print(f"{text}")
    print(f"{'='*60}")
    print("\nPerformance:")
    print(f" Total time: {total_time:.3f}s")
    # Zero-length audio would otherwise raise ZeroDivisionError here.
    if duration > 0:
        print(f" Real-time factor: {total_time / duration:.2f}x")
    else:
        print(" Real-time factor: n/a (empty audio)")
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: parakeet
|
|
3
|
+
description: Parakeet speech-to-text system. Provides fast CPU-based transcription using Parakeet TDT INT8 models. Use when checking transcription status or troubleshooting audio issues.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Parakeet STT
|
|
7
|
+
|
|
8
|
+
Fast CPU-based speech-to-text using NVIDIA's Parakeet TDT INT8 models.
|
|
9
|
+
|
|
10
|
+
## Model Versions
|
|
11
|
+
|
|
12
|
+
| Version | Description | Languages |
|
|
13
|
+
|---------|-------------|-----------|
|
|
14
|
+
| **V2** | English optimized | English (higher accuracy) |
|
|
15
|
+
| **V3** | Multilingual | 25 European languages + auto-detect |
|
|
16
|
+
|
|
17
|
+
## Install / Switch Model
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
# Install V2 (English optimized - default)
|
|
21
|
+
~/.openclaw/extensions/parakeet-stt/scripts/install.sh v2
|
|
22
|
+
|
|
23
|
+
# Install/switch to V3 (Multilingual)
|
|
24
|
+
~/.openclaw/extensions/parakeet-stt/scripts/install.sh v3
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
The install script:
|
|
28
|
+
- Downloads the pre-quantized INT8 model (~475MB)
|
|
29
|
+
- Sets up the Python virtual environment
|
|
30
|
+
- Creates a symlink at `~/.openclaw/tools/parakeet/model` pointing to the active model
|
|
31
|
+
|
|
32
|
+
## Status Check
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
openclaw parakeet:status
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## How It Works
|
|
39
|
+
|
|
40
|
+
1. Audio messages are automatically transcribed before reaching the agent
|
|
41
|
+
2. First transcription loads the model (~3 seconds)
|
|
42
|
+
3. Model stays loaded for subsequent transcriptions
|
|
43
|
+
4. After 20 minutes of inactivity, model unloads to save memory
|
|
44
|
+
|
|
45
|
+
## Model Selection
|
|
46
|
+
|
|
47
|
+
The daemon automatically selects the model by checking the following, in order:
|
|
48
|
+
|
|
49
|
+
1. **Symlink** (`~/.openclaw/tools/parakeet/model`) - set by install.sh
|
|
50
|
+
2. **Environment variable** `PARAKEET_MODEL_VERSION=v2` or `v3`
|
|
51
|
+
3. **Auto-detect** - looks for model-v2, then model-v3 directories
|
|
52
|
+
|
|
53
|
+
## Troubleshooting
|
|
54
|
+
|
|
55
|
+
### Check if configured
|
|
56
|
+
|
|
57
|
+
Look at `tools.media.audio.models` in openclaw.json - it should point to the parakeet client script.
|
|
58
|
+
|
|
59
|
+
### Check daemon status
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
# Check if daemon socket exists
|
|
63
|
+
ls -la /tmp/parakeet-lazy.sock
|
|
64
|
+
|
|
65
|
+
# Watch logs
|
|
66
|
+
openclaw logs --follow | grep -i parakeet
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Model not found error
|
|
70
|
+
|
|
71
|
+
Run the install script:
|
|
72
|
+
```bash
|
|
73
|
+
~/.openclaw/extensions/parakeet-stt/scripts/install.sh v2
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Manual transcription test
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
# Activate venv and test
|
|
80
|
+
source ~/.openclaw/tools/parakeet/venv/bin/activate
|
|
81
|
+
python ~/.openclaw/tools/parakeet/parakeet_transcribe.py path/to/audio.ogg
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Configuration
|
|
85
|
+
|
|
86
|
+
In `plugins.entries.parakeet-stt`:
|
|
87
|
+
- `enabled`: Enable or disable the plugin
|
|
88
|
+
- `modelVersion`: "v2" or "v3" (informational only - the active model is switched by running install.sh)
|
|
89
|
+
- `inactivityTimeoutMin`: Minutes before unloading (default: 20)
|