@argo-video/cli 0.7.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,193 @@
#!/usr/bin/env bash
#
# Quick voice cloning preview — send voiceover text to mlx-audio server
# and get back individual clips + an optional joined clip.
#
# Usage:
#   ./scripts/voice-clone-preview.sh \
#     --ref-audio ./assets/ref-voice.wav \
#     --ref-text "Hi, my name is Shreyas. I build developer tools." \
#     --voiceover demos/showcase.voiceover.json
#
#   # Single line of text (no manifest):
#   ./scripts/voice-clone-preview.sh \
#     --ref-audio ./assets/ref-voice.wav \
#     --ref-text "Hi, my name is Shreyas." \
#     --text "Welcome to the demo."
#
# Options:
#   --ref-audio PATH   Reference voice WAV (required)
#   --ref-text TEXT    Transcript of reference audio (required)
#   --voiceover PATH   Voiceover JSON manifest (array of {scene, text, speed?, voice?})
#   --text TEXT        Single text to synthesize (alternative to --voiceover)
#   --model ID         Model ID (default: mlx-community/Qwen3-TTS-12Hz-0.6B-Base-bf16)
#   --server URL       Server URL (default: http://localhost:8000)
#   --out-dir PATH     Output directory (default: ./voice-preview)
#   --join             Also produce a single joined clip
#   --play             Play the output when done (requires ffplay)
#   --voice NAME       Default voice (default: af_heart)
#
set -euo pipefail

# Print a message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the header comment of this file (line 2 up to the `set` line) with the
# leading "# " stripped. awk is used because the `\?` quantifier and the
# `p }` block syntax of the previous sed one-liner are GNU-only.
usage() {
  awk 'NR == 1 { next } /^set / { exit } /^#/ { sub(/^# ?/, ""); print }' "$0"
}

# Abort with a readable message unless the option named $1 is followed by a
# value. $2 is the number of arguments still left, including the option.
require_value() {
  [[ "$2" -ge 2 ]] || die "Error: $1 requires a value."
}

# Defaults
MODEL="mlx-community/Qwen3-TTS-12Hz-0.6B-Base-bf16"
SERVER="http://localhost:8000"
OUT_DIR="./voice-preview"
REF_AUDIO=""
REF_TEXT=""
VOICEOVER=""
SINGLE_TEXT=""
JOIN=false
PLAY=false
VOICE="af_heart"

while [[ $# -gt 0 ]]; do
  case "$1" in
    --ref-audio) require_value "$1" "$#"; REF_AUDIO="$2"; shift 2 ;;
    --ref-text) require_value "$1" "$#"; REF_TEXT="$2"; shift 2 ;;
    --voiceover) require_value "$1" "$#"; VOICEOVER="$2"; shift 2 ;;
    --text) require_value "$1" "$#"; SINGLE_TEXT="$2"; shift 2 ;;
    --model) require_value "$1" "$#"; MODEL="$2"; shift 2 ;;
    --server) require_value "$1" "$#"; SERVER="$2"; shift 2 ;;
    --out-dir) require_value "$1" "$#"; OUT_DIR="$2"; shift 2 ;;
    --voice) require_value "$1" "$#"; VOICE="$2"; shift 2 ;;
    --join) JOIN=true; shift ;;
    --play) PLAY=true; shift ;;
    -h | --help)
      usage
      exit 0
      ;;
    *) die "Unknown option: $1" ;;
  esac
done

if [[ -z "$REF_AUDIO" || -z "$REF_TEXT" ]]; then
  die "Error: --ref-audio and --ref-text are required."
fi

if [[ -z "$VOICEOVER" && -z "$SINGLE_TEXT" ]]; then
  die "Error: provide --voiceover <manifest.json> or --text <string>."
fi

# --text takes precedence over --voiceover when both are given.
if [[ -n "$SINGLE_TEXT" ]]; then
  CLIP_MODE="text"
  CLIP_SOURCE="$SINGLE_TEXT"
else
  CLIP_MODE="manifest"
  CLIP_SOURCE="$VOICEOVER"
  [[ -f "$VOICEOVER" ]] || die "Error: voiceover manifest not found: $VOICEOVER"
fi

# Fail early with a clear message: stderr of these tools is discarded further
# down, so a missing binary would otherwise abort the script without a trace.
for tool in curl python3 ffmpeg ffprobe; do
  command -v "$tool" >/dev/null 2>&1 || die "Error: required tool not found: $tool"
done

# Check server is running
if ! curl -sf --connect-timeout 3 "$SERVER/v1/audio/speech" -o /dev/null -X POST \
  -H "Content-Type: application/json" \
  -d '{"model":"test","input":"test"}' 2>/dev/null; then
  # It's OK if the request fails with a model error — server is up
  if ! curl -sf --connect-timeout 3 "$SERVER" -o /dev/null 2>/dev/null &&
    ! curl -sf --connect-timeout 3 "$SERVER/docs" -o /dev/null 2>/dev/null; then
    {
      echo "Warning: mlx-audio server may not be running at $SERVER"
      echo "Start it with: python3 -m mlx_audio.server --model $MODEL"
      echo ""
    } >&2
  fi
fi

mkdir -p "$OUT_DIR"
# Physical absolute path of the output directory; used for the ffmpeg concat
# list so the script does not depend on a `realpath` binary being installed.
OUT_ABS=$(cd "$OUT_DIR" && pwd -P)

CLIPS_LIST=$(mktemp)
CONCAT_LIST=""
cleanup() {
  rm -f -- "$CLIPS_LIST"
  if [[ -n "$CONCAT_LIST" ]]; then
    rm -f -- "$CONCAT_LIST"
  fi
}
trap cleanup EXIT

# Build the list of clips to generate.
# Format: one clip per line, "<scene>\t<JSON request payload>".
# Every user-supplied value reaches Python through argv and is serialised with
# json.dumps; nothing is spliced into Python source, so quotes, backslashes,
# pipes and newlines in the text cannot break or inject code. json.dumps
# escapes tabs and newlines, which keeps each payload on a single line.
python3 - "$MODEL" "$VOICE" "$REF_AUDIO" "$REF_TEXT" "$CLIP_MODE" "$CLIP_SOURCE" \
  <<'PY' >"$CLIPS_LIST"
import json
import re
import sys

model, default_voice, ref_audio, ref_text, mode, source = sys.argv[1:7]

if mode == "text":
    scenes = [{"scene": "single", "text": source}]
else:
    with open(source, encoding="utf-8") as f:
        scenes = json.load(f)
    if not isinstance(scenes, list):
        sys.exit("Error: voiceover manifest must be a JSON array of scenes.")

for i, s in enumerate(scenes):
    # The scene name becomes part of a file name and of a tab-separated line,
    # so path separators, tabs and newlines are replaced.
    scene = re.sub(r"[/\t\r\n]+", "_", str(s.get("scene", f"scene-{i}")))
    scene = scene or f"scene-{i}"
    payload = {
        "model": model,
        "input": s["text"],
        "voice": s.get("voice", default_voice),
        "speed": s.get("speed", 1.0),
        "ref_audio": ref_audio,
        "ref_text": ref_text,
    }
    print(f"{scene}\t{json.dumps(payload)}")
PY

TOTAL=$(wc -l <"$CLIPS_LIST" | tr -d ' ')
echo "Generating $TOTAL clip(s) via $SERVER"
echo "Model: $MODEL"
echo "Ref audio: $REF_AUDIO"
echo "Output: $OUT_DIR/"
echo ""

GENERATED_FILES=()
IDX=0

# The clip list is read on fd 3 so that commands inside the loop cannot
# consume it through stdin.
while IFS=$'\t' read -r -u 3 SCENE PAYLOAD; do
  IDX=$((IDX + 1))
  OUTFILE="$OUT_DIR/$(printf '%02d' "$IDX")-${SCENE}.wav"
  RAW="$OUTFILE.raw"

  printf " [%d/%d] %s ... " "$IDX" "$TOTAL" "$SCENE"

  # No -f here: curl then exits 0 on HTTP errors and -w reports the real
  # status. `|| true` keeps a connection failure from aborting under set -e.
  HTTP_CODE=$(curl -s -w '%{http_code}' -o "$RAW" \
    -X POST "$SERVER/v1/audio/speech" \
    -H "Content-Type: application/json" \
    -d "$PAYLOAD" 2>/dev/null) || true
  HTTP_CODE=${HTTP_CODE:-000}

  if [[ "$HTTP_CODE" != "200" ]]; then
    rm -f -- "$RAW"
    printf "FAILED (HTTP %s)\n" "$HTTP_CODE"
    continue
  fi

  # Convert to consistent WAV format. -nostdin disables ffmpeg's interactive
  # reading of standard input.
  if ! ffmpeg -nostdin -y -i "$RAW" \
    -ar 24000 -ac 1 -acodec pcm_s16le \
    "$OUTFILE" 2>/dev/null; then
    rm -f -- "$RAW" "$OUTFILE"
    printf "FAILED (ffmpeg could not convert the response)\n"
    continue
  fi
  rm -f -- "$RAW"

  DURATION=$(ffprobe -v error -show_entries format=duration \
    -of csv=p=0 "$OUTFILE" 2>/dev/null) || DURATION=""
  # Only hand printf a value it can parse as a number; anything else would
  # make printf fail and, under set -e, end the run.
  if [[ "$DURATION" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    printf "done (%.1fs)\n" "$DURATION"
  else
    printf "done\n"
  fi
  GENERATED_FILES+=("$OUTFILE")
done 3<"$CLIPS_LIST"

echo ""

# Join clips if requested
if [[ "$JOIN" == true && ${#GENERATED_FILES[@]} -gt 1 ]]; then
  JOINED="$OUT_DIR/joined.wav"
  CONCAT_LIST=$(mktemp)
  for f in "${GENERATED_FILES[@]}"; do
    # Absolute path, with each single quote rewritten as '\'' so it survives
    # the single-quoted `file` directive of the concat list.
    ESCAPED=$(printf '%s\n' "$OUT_ABS/${f##*/}" | sed "s/'/'\\\\''/g")
    printf "file '%s'\n" "$ESCAPED" >>"$CONCAT_LIST"
  done

  ffmpeg -nostdin -y -f concat -safe 0 -i "$CONCAT_LIST" \
    -ar 24000 -ac 1 -acodec pcm_s16le \
    "$JOINED" 2>/dev/null || die "Error: ffmpeg failed to join the clips."
  rm -f -- "$CONCAT_LIST"
  CONCAT_LIST=""

  TOTAL_DURATION=$(ffprobe -v error -show_entries format=duration \
    -of csv=p=0 "$JOINED" 2>/dev/null) || TOTAL_DURATION="?"
  echo "Joined clip: $JOINED (${TOTAL_DURATION}s)"
  echo ""

  if [[ "$PLAY" == true ]]; then
    echo "Playing joined clip..."
    ffplay -autoexit -nodisp "$JOINED" 2>/dev/null ||
      echo "Warning: could not play $JOINED (is ffplay installed?)" >&2
  fi
elif [[ "$PLAY" == true && ${#GENERATED_FILES[@]} -gt 0 ]]; then
  LAST_IDX=$((${#GENERATED_FILES[@]} - 1))
  PLAY_FILE="${GENERATED_FILES[$LAST_IDX]}"
  echo "Playing: $PLAY_FILE"
  ffplay -autoexit -nodisp "$PLAY_FILE" 2>/dev/null ||
    echo "Warning: could not play $PLAY_FILE (is ffplay installed?)" >&2
fi

echo "Done! Clips saved to $OUT_DIR/"