@argo-video/cli 0.7.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -3
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +12 -3
- package/dist/cli.js.map +1 -1
- package/dist/init.d.ts +15 -0
- package/dist/init.d.ts.map +1 -1
- package/dist/init.js +55 -2
- package/dist/init.js.map +1 -1
- package/dist/parse-playwright.d.ts +49 -0
- package/dist/parse-playwright.d.ts.map +1 -0
- package/dist/parse-playwright.js +265 -0
- package/dist/parse-playwright.js.map +1 -0
- package/dist/tts/engines/mlx-audio.d.ts +31 -0
- package/dist/tts/engines/mlx-audio.d.ts.map +1 -1
- package/dist/tts/engines/mlx-audio.js +47 -5
- package/dist/tts/engines/mlx-audio.js.map +1 -1
- package/dist/tts/engines/sarvam.d.ts.map +1 -1
- package/dist/tts/engines/sarvam.js +20 -23
- package/dist/tts/engines/sarvam.js.map +1 -1
- package/package.json +9 -2
- package/scripts/generate_logo_thumbnail.py +174 -0
- package/scripts/record-voice-ref.sh +87 -0
- package/scripts/setup-mlx-audio.sh +66 -0
- package/scripts/voice-clone-preview.sh +193 -0
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
#
|
|
3
|
+
# Quick voice cloning preview — send voiceover text to mlx-audio server
|
|
4
|
+
# and get back individual clips + an optional joined clip.
|
|
5
|
+
#
|
|
6
|
+
# Usage:
|
|
7
|
+
# ./scripts/voice-clone-preview.sh \
|
|
8
|
+
# --ref-audio ./assets/ref-voice.wav \
|
|
9
|
+
# --ref-text "Hi, my name is Shreyas. I build developer tools." \
|
|
10
|
+
# --voiceover demos/showcase.voiceover.json
|
|
11
|
+
#
|
|
12
|
+
# # Single line of text (no manifest):
|
|
13
|
+
# ./scripts/voice-clone-preview.sh \
|
|
14
|
+
# --ref-audio ./assets/ref-voice.wav \
|
|
15
|
+
# --ref-text "Hi, my name is Shreyas." \
|
|
16
|
+
# --text "Welcome to the demo."
|
|
17
|
+
#
|
|
18
|
+
# Options:
|
|
19
|
+
# --ref-audio PATH Reference voice WAV (required)
|
|
20
|
+
# --ref-text TEXT Transcript of reference audio (required)
|
|
21
|
+
# --voiceover PATH Voiceover JSON manifest (array of {scene, text, speed?, voice?})
|
|
22
|
+
# --text TEXT Single text to synthesize (alternative to --voiceover)
|
|
23
|
+
# --model ID Model ID (default: mlx-community/Qwen3-TTS-12Hz-0.6B-Base-bf16)
|
|
24
|
+
# --server URL Server URL (default: http://localhost:8000)
|
|
25
|
+
# --out-dir PATH Output directory (default: ./voice-preview)
|
|
26
|
+
# --join Also produce a single joined clip
|
|
27
|
+
# --play Play the output when done (requires ffplay)
|
|
28
|
+
# --voice NAME Default voice (default: af_heart)
|
|
29
|
+
#
|
|
30
|
+
set -euo pipefail

# Defaults (each one can be overridden by the matching command-line option).
MODEL="mlx-community/Qwen3-TTS-12Hz-0.6B-Base-bf16"
SERVER="http://localhost:8000"
OUT_DIR="./voice-preview"
REF_AUDIO=""
REF_TEXT=""
VOICEOVER=""
SINGLE_TEXT=""
JOIN=false
PLAY=false
VOICE="af_heart"

# Print the usage text: the comment header from line 2 of this script up to
# the `set` line, with the leading "# " stripped.
# Written with separate -e expressions and a POSIX \{0,1\} interval so it
# works with both GNU sed and BSD (macOS) sed.
print_usage() {
  sed -n -e '2,/^set /{' -e '/^#/s/^# \{0,1\}//p' -e '}' "$0"
}

# Abort with a clear message when an option that takes a value is the last
# word on the command line (otherwise `set -u` dies with "unbound variable").
# Arguments: $1 - option name, $2 - number of remaining arguments
need_value() {
  if [[ $2 -lt 2 ]]; then
    echo "Error: $1 requires a value." >&2
    exit 1
  fi
}

# Parse command-line options into the global settings above.
# Arguments: the script's arguments ("$@")
# Exits 0 after printing usage for -h/--help, 1 on an unknown option or a
# missing option value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --ref-audio) need_value "$1" "$#"; REF_AUDIO="$2"; shift 2;;
      --ref-text)  need_value "$1" "$#"; REF_TEXT="$2"; shift 2;;
      --voiceover) need_value "$1" "$#"; VOICEOVER="$2"; shift 2;;
      --text)      need_value "$1" "$#"; SINGLE_TEXT="$2"; shift 2;;
      --model)     need_value "$1" "$#"; MODEL="$2"; shift 2;;
      --server)    need_value "$1" "$#"; SERVER="$2"; shift 2;;
      --out-dir)   need_value "$1" "$#"; OUT_DIR="$2"; shift 2;;
      --voice)     need_value "$1" "$#"; VOICE="$2"; shift 2;;
      --join)      JOIN=true; shift;;
      --play)      PLAY=true; shift;;
      -h|--help)
        print_usage
        exit 0;;
      *) echo "Unknown option: $1" >&2; exit 1;;
    esac
  done
}

parse_args "$@"
|
|
62
|
+
|
|
63
|
+
# Validate the required inputs before doing any work. Diagnostics go to
# stderr so they never mix with output a caller may be capturing.
if [[ -z "$REF_AUDIO" || -z "$REF_TEXT" ]]; then
  echo "Error: --ref-audio and --ref-text are required." >&2
  exit 1
fi

if [[ -z "$VOICEOVER" && -z "$SINGLE_TEXT" ]]; then
  echo "Error: provide --voiceover <manifest.json> or --text <string>." >&2
  exit 1
fi

# The manifest is only consulted when --text is not given; it is read
# locally, so fail early with a clear message instead of a Python traceback.
if [[ -z "$SINGLE_TEXT" && ! -r "$VOICEOVER" ]]; then
  echo "Error: voiceover manifest not found or unreadable: $VOICEOVER" >&2
  exit 1
fi
|
|
72
|
+
|
|
73
|
+
# Check server is running.
# Probe the speech endpoint with a dummy request first; if that fails, fall
# back to plain GETs of the root and /docs URLs. A failing speech request on
# its own is fine (e.g. a model error) as long as the server answers
# somewhere. Every probe uses the same short connect timeout so an
# unreachable host cannot stall the script. This is advisory only: the
# warning goes to stderr and the script carries on.
if ! curl -sf --connect-timeout 3 "$SERVER/v1/audio/speech" -o /dev/null -X POST \
  -H "Content-Type: application/json" \
  -d '{"model":"test","input":"test"}' 2>/dev/null; then
  if ! curl -sf --connect-timeout 3 "$SERVER" -o /dev/null 2>/dev/null && \
     ! curl -sf --connect-timeout 3 "$SERVER/docs" -o /dev/null 2>/dev/null; then
    echo "Warning: mlx-audio server may not be running at $SERVER" >&2
    echo "Start it with: python3 -m mlx_audio.server --model $MODEL" >&2
    echo "" >&2
  fi
fi
|
|
85
|
+
|
|
86
|
+
mkdir -p "$OUT_DIR"

# Build the list of clips to generate.
# Format: index|scene_name|text|speed|voice   (one record per line)
CLIPS_LIST=$(mktemp)
trap 'rm -f "$CLIPS_LIST"' EXIT

# Write the clip records to stdout.
# Globals (read): SINGLE_TEXT, VOICEOVER, VOICE
# With --text a single record named "single" is produced; otherwise the
# voiceover JSON manifest is parsed with python3.
# '|' separates fields and a newline separates records, so both are replaced
# by spaces inside every free-text field.
build_clips_list() {
  if [[ -n "$SINGLE_TEXT" ]]; then
    local text=${SINGLE_TEXT//|/ }
    text=${text//$'\n'/ }
    printf '0|single|%s|1.0|%s\n' "$text" "$VOICE"
  else
    # The manifest path and default voice are passed as argv rather than
    # pasted into the Python source, so quotes in them cannot break (or
    # inject into) the program.
    python3 - "$VOICEOVER" "$VOICE" <<'PY'
import json
import sys

manifest_path, default_voice = sys.argv[1], sys.argv[2]


def field(value):
    """Make a value safe for the '|'-separated, line-based record format."""
    return str(value).replace('|', ' ').replace('\r', ' ').replace('\n', ' ')


with open(manifest_path) as f:
    scenes = json.load(f)
for i, s in enumerate(scenes):
    scene = field(s.get('scene', f'scene-{i}'))
    text = field(s['text'])
    speed = s.get('speed', 1.0)
    voice = field(s.get('voice', default_voice))
    print(f'{i}|{scene}|{text}|{speed}|{voice}')
PY
  fi
}

build_clips_list > "$CLIPS_LIST"
|
|
109
|
+
|
|
110
|
+
# Print the JSON request body for one clip on stdout.
# Arguments: $1 model, $2 input text, $3 voice, $4 speed,
#            $5 reference audio path, $6 reference transcript
# Returns non-zero if the speed is not a number.
# All values travel as argv, never as Python source, so apostrophes, quotes
# and backslashes in the text are encoded correctly by json.dumps.
build_payload() {
  python3 - "$@" <<'PY'
import json
import sys

model, text, voice, speed, ref_audio, ref_text = sys.argv[1:7]
print(json.dumps({
    'model': model,
    'input': text,
    'voice': voice,
    'speed': float(speed),
    'ref_audio': ref_audio,
    'ref_text': ref_text,
}))
PY
}

TOTAL=$(wc -l < "$CLIPS_LIST" | tr -d ' ')
echo "Generating $TOTAL clip(s) via $SERVER"
echo "Model: $MODEL"
echo "Ref audio: $REF_AUDIO"
echo "Output: $OUT_DIR/"
echo ""

GENERATED_FILES=()
IDX=0
NUMBER_RE='^[0-9]+([.][0-9]+)?$'

# The clip list is read on fd 3 (not stdin) and ffmpeg runs with -nostdin,
# so no command inside the loop can swallow the remaining records.
while IFS='|' read -r -u 3 _ SCENE TEXT SPEED CLIP_VOICE; do
  IDX=$((IDX + 1))
  # A '/' in the scene name would point the output into a missing directory.
  OUTFILE="$OUT_DIR/$(printf '%02d' "$IDX")-${SCENE//\//-}.wav"

  printf " [%d/%d] %s ... " "$IDX" "$TOTAL" "$SCENE"

  # Build JSON payload; a bad record fails this clip only, not the whole run.
  if ! PAYLOAD=$(build_payload "$MODEL" "$TEXT" "$CLIP_VOICE" "$SPEED" \
    "$REF_AUDIO" "$REF_TEXT" 2>/dev/null); then
    printf "FAILED (could not build request)\n"
    continue
  fi

  # No -f: curl then exits 0 on HTTP errors and -w reports the real status.
  # On a transport error the substitution fails and the code is set to 000
  # (instead of appending "000" to whatever -w already printed).
  HTTP_CODE=$(curl -s -w '%{http_code}' -o "$OUTFILE.raw" \
    -X POST "$SERVER/v1/audio/speech" \
    -H "Content-Type: application/json" \
    -d "$PAYLOAD" 2>/dev/null) || HTTP_CODE="000"

  if [[ "$HTTP_CODE" == "200" ]]; then
    # Convert to consistent WAV format
    if ffmpeg -nostdin -y -i "$OUTFILE.raw" \
      -ar 24000 -ac 1 -acodec pcm_s16le \
      "$OUTFILE" 2>/dev/null; then
      rm -f "$OUTFILE.raw"

      DURATION=$(ffprobe -v error -show_entries format=duration \
        -of csv=p=0 "$OUTFILE" 2>/dev/null) || DURATION=""
      # ffprobe can print nothing or "N/A"; printf %f would reject that.
      [[ "$DURATION" =~ $NUMBER_RE ]] || DURATION=0
      printf "done (%.1fs)\n" "$DURATION"
      GENERATED_FILES+=("$OUTFILE")
    else
      rm -f "$OUTFILE.raw"
      printf "FAILED (ffmpeg could not convert the response)\n"
    fi
  else
    rm -f "$OUTFILE.raw"
    printf "FAILED (HTTP %s)\n" "$HTTP_CODE"
  fi
done 3< "$CLIPS_LIST"
|
|
161
|
+
|
|
162
|
+
echo ""

# Print one path as a `file` directive for ffmpeg's concat demuxer.
# Arguments: $1 - path (already absolute)
# The path is wrapped in single quotes; each embedded single quote is written
# as '\'' (close quote, escaped quote, reopen), as the concat list syntax
# requires.
concat_entry() {
  local rest=$1 quote="'" out="" head
  while [[ "$rest" == *"$quote"* ]]; do
    head=${rest%%"$quote"*}
    out+="${head}'\\''"
    rest=${rest#*"$quote"}
  done
  out+=$rest
  printf "file '%s'\n" "$out"
}

# Join clips if requested
if [[ "$JOIN" == true && ${#GENERATED_FILES[@]} -gt 1 ]]; then
  JOINED="$OUT_DIR/joined.wav"
  CONCAT_LIST=$(mktemp)
  for f in "${GENERATED_FILES[@]}"; do
    # Absolute paths: the concat demuxer resolves relative entries against
    # the list file's directory, which is the temp dir here.
    concat_entry "$(realpath "$f")" >> "$CONCAT_LIST"
  done

  # Capture the status instead of letting `set -e` abort, so the temporary
  # list is removed even when ffmpeg fails.
  JOIN_STATUS=0
  ffmpeg -nostdin -y -f concat -safe 0 -i "$CONCAT_LIST" \
    -ar 24000 -ac 1 -acodec pcm_s16le \
    "$JOINED" 2>/dev/null || JOIN_STATUS=$?
  rm -f "$CONCAT_LIST"
  if [[ $JOIN_STATUS -ne 0 ]]; then
    echo "Error: ffmpeg failed to join clips (exit $JOIN_STATUS)." >&2
    exit 1
  fi

  TOTAL_DURATION=$(ffprobe -v error -show_entries format=duration \
    -of csv=p=0 "$JOINED" 2>/dev/null)
  echo "Joined clip: $JOINED (${TOTAL_DURATION}s)"
  echo ""

  if [[ "$PLAY" == true ]]; then
    echo "Playing joined clip..."
    ffplay -autoexit -nodisp "$JOINED" 2>/dev/null ||
      echo "Warning: playback failed (is ffplay installed?)" >&2
  fi
elif [[ "$PLAY" == true && ${#GENERATED_FILES[@]} -gt 0 ]]; then
  # Without a joined clip, play only the most recently generated one.
  LAST_IDX=$(( ${#GENERATED_FILES[@]} - 1 ))
  PLAY_FILE="${GENERATED_FILES[$LAST_IDX]}"
  echo "Playing: $PLAY_FILE"
  ffplay -autoexit -nodisp "$PLAY_FILE" 2>/dev/null ||
    echo "Warning: playback failed (is ffplay installed?)" >&2
fi

echo "Done! Clips saved to $OUT_DIR/"
|