agentvibes 4.6.2 → 4.6.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,356 +1,356 @@
1
- #!/usr/bin/env bash
2
- #
3
- # File: .claude/hooks/play-tts-soprano.sh
4
- #
5
- # AgentVibes - Finally, your AI Agents can Talk Back! Text-to-Speech WITH personality for AI Assistants!
6
- # Website: https://agentvibes.org
7
- # Repository: https://github.com/paulpreibisch/AgentVibes
8
- #
9
- # Co-created by Paul Preibisch with Claude AI
10
- # Copyright (c) 2025 Paul Preibisch
11
- #
12
- # Licensed under the Apache License, Version 2.0 (the "License");
13
- # you may not use this file except in compliance with the License.
14
- # You may obtain a copy of the License at
15
- #
16
- # http://www.apache.org/licenses/LICENSE-2.0
17
- #
18
- # Unless required by applicable law or agreed to in writing, software
19
- # distributed under the License is distributed on an "AS IS" BASIS,
20
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21
- # See the License for the specific language governing permissions and
22
- # limitations under the License.
23
- #
24
- # DISCLAIMER: This software is provided "AS IS", WITHOUT WARRANTY OF ANY KIND,
25
- # express or implied. Use at your own risk. See the Apache License for details.
26
- #
27
- # ---
28
- #
29
- # @fileoverview Soprano TTS Provider Implementation - Free, local, neural-quality TTS
30
- # @context Provides ultra-lightweight on-device neural TTS via Soprano (80M params)
31
- # @architecture Implements provider interface contract with 3 synthesis modes (WebUI/API/CLI)
32
- # @dependencies soprano-tts (pip), soprano-gradio-synth.py, ffmpeg (optional padding), audio players
33
- # @entrypoints Called by play-tts.sh router when provider=soprano
34
- # @patterns Provider contract: text/voice → audio file path, auto-mode detection, Gradio SSE protocol
35
- # @related play-tts.sh, soprano-gradio-synth.py, provider-manager.sh, GitHub Issue #94
36
- #
37
- # Supports three modes (auto-detected in priority order):
38
- # 1. WebUI mode: Gradio WebUI running (soprano-webui), uses Python helper
39
- # 2. API mode: OpenAI-compatible server (uvicorn soprano.server:app), uses curl
40
- # 3. CLI mode: Direct `soprano` command — reloads model each call (slowest)
41
- #
42
- # Environment variables:
43
- # SOPRANO_PORT — WebUI/API port (default: 7860)
44
- # SOPRANO_DEVICE — Device for CLI mode: auto|cuda|cpu|mps (default: auto)
45
- #
46
-
47
- # Fix locale warnings
48
- export LC_ALL=C
49
-
50
- TEXT="$1"
51
- VOICE_OVERRIDE="$2" # Ignored — Soprano has a single voice, kept for provider contract
52
-
53
- # Strip emojis, asterisks, and markdown formatting
54
- TEXT=$(printf '%s' "$TEXT" | perl -CSD -pe '
55
- s/[\x{1F300}-\x{1F9FF}]//g;
56
- s/[\x{2600}-\x{27BF}]//g;
57
- s/[\x{FE00}-\x{FE0F}]//g;
58
- s/[\x{200D}]//g;
59
- s/[\x{2500}-\x{257F}]//g;
60
- s/[\x{2580}-\x{259F}]//g;
61
- s/\*+//g; s/#+\s*//g; s/`//g; s/~+//g; s/^\s*[-]\s*//g;
62
- ')
63
-
64
- SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
65
- source "$SCRIPT_DIR/audio-cache-utils.sh"
66
-
67
- SOPRANO_PORT="${SOPRANO_PORT:-7860}"
68
- SOPRANO_DEVICE="${SOPRANO_DEVICE:-auto}"
69
-
70
- # @function validate_inputs
71
- # @intent Check required parameters
72
- # @why Fail fast with clear errors if inputs missing
73
- # @exitcode 1=missing text
74
- if [[ -z "$TEXT" ]]; then
75
- echo "Usage: $0 \"text to speak\" [voice_override]"
76
- exit 1
77
- fi
78
-
79
- # @function check_webui_server
80
- # @intent Detect if Soprano Gradio WebUI is reachable
81
- # @why WebUI mode keeps model in memory for fastest repeated synthesis
82
- # @returns exitcode 0=reachable, 1=not reachable
83
- check_webui_server() {
84
- curl -sf --max-time 2 "http://127.0.0.1:${SOPRANO_PORT}/gradio_api/info" -o /dev/null 2>/dev/null ||
85
- curl -sf --max-time 2 "http://127.0.0.1:${SOPRANO_PORT}/info" -o /dev/null 2>/dev/null
86
- }
87
-
88
- # @function check_api_server
89
- # @intent Detect if Soprano OpenAI-compatible API server is reachable
90
- # @why API mode is simpler than WebUI (direct WAV response, no SSE polling)
91
- # @returns exitcode 0=reachable, 1=not reachable
92
- check_api_server() {
93
- curl -sf --max-time 2 "http://127.0.0.1:${SOPRANO_PORT}/v1/audio/speech" \
94
- -H "Content-Type: application/json" \
95
- -d '{"input":"test"}' -o /dev/null 2>/dev/null
96
- }
97
-
98
- # @function check_soprano_available
99
- # @intent Verify at least one synthesis mode is available
100
- # @why Provide helpful installation instructions if nothing works
101
- # @exitcode 2=soprano not installed and no server running
102
- if ! command -v soprano &>/dev/null && ! check_webui_server && ! check_api_server; then
103
- echo "❌ Error: Soprano TTS not installed and no server running on port $SOPRANO_PORT"
104
- echo ""
105
- echo "Install: pip install soprano-tts"
106
- echo " (GPU): pip install soprano-tts[lmdeploy]"
107
- echo ""
108
- echo "Start WebUI: soprano-webui"
109
- echo "Start API: uvicorn soprano.server:app --host 127.0.0.1 --port $SOPRANO_PORT"
110
- exit 2
111
- fi
112
-
113
- # @function determine_audio_directory
114
- # @intent Find appropriate directory for audio file storage
115
- # @why Supports project-local and global storage
116
- # @returns Sets $AUDIO_DIR global variable
117
- # SECURITY: Canonicalize path to prevent traversal (#128)
118
- if [[ -n "${CLAUDE_PROJECT_DIR:-}" ]]; then
119
- CLAUDE_PROJECT_DIR=$(cd "${CLAUDE_PROJECT_DIR}" 2>/dev/null && pwd -P) || CLAUDE_PROJECT_DIR=""
120
- fi
121
- if [[ -n "${CLAUDE_PROJECT_DIR:-}" ]]; then
122
- AUDIO_DIR="$CLAUDE_PROJECT_DIR/.claude/audio"
123
- else
124
- CURRENT_DIR="$PWD"
125
- while [[ "$CURRENT_DIR" != "/" ]]; do
126
- if [[ -d "$CURRENT_DIR/.claude" ]]; then
127
- AUDIO_DIR="$CURRENT_DIR/.claude/audio"
128
- break
129
- fi
130
- CURRENT_DIR=$(dirname "$CURRENT_DIR")
131
- done
132
- if [[ -z "$AUDIO_DIR" ]]; then
133
- AUDIO_DIR="$HOME/.claude/audio"
134
- fi
135
- fi
136
-
137
- mkdir -p "$AUDIO_DIR"
138
- # SECURITY: Use mktemp for unpredictable filenames (#130)
139
- TEMP_FILE=$(mktemp "$AUDIO_DIR/tts-XXXXXX.wav")
140
-
141
- # @function synthesize_speech
142
- # @intent Generate speech using best available Soprano mode
143
- # @why Auto-detect WebUI → API → CLI for optimal performance
144
- # @param Uses globals: $TEXT, $SOPRANO_PORT, $SOPRANO_DEVICE
145
- # @returns Creates WAV file at $TEMP_FILE, sets $SYNTH_MODE
146
- # @exitcode 4=synthesis error
147
- SYNTH_MODE=""
148
-
149
- if check_webui_server; then
150
- # Gradio WebUI mode — use Python helper for SSE protocol
151
- SYNTH_MODE="webui"
152
- python3 "$SCRIPT_DIR/soprano-gradio-synth.py" "$TEXT" "$TEMP_FILE" "$SOPRANO_PORT" 2>/dev/null
153
- elif check_api_server; then
154
- # OpenAI-compatible API mode — direct curl
155
- SYNTH_MODE="api"
156
- # SECURITY: Use proper JSON encoding to prevent injection (#133)
157
- _JSON_PAYLOAD=$(printf '%s' "$TEXT" | python3 -c 'import sys,json; print(json.dumps({"input":sys.stdin.read()}))' 2>/dev/null) || \
158
- _JSON_PAYLOAD=$(printf '{"input":"%s"}' "$(printf '%s' "$TEXT" | sed 's/\\/\\\\/g; s/"/\\"/g; s/\t/\\t/g')")
159
- curl -sf "http://127.0.0.1:${SOPRANO_PORT}/v1/audio/speech" \
160
- -H "Content-Type: application/json" \
161
- -d "$_JSON_PAYLOAD" \
162
- --output "$TEMP_FILE" 2>/dev/null
163
- else
164
- # CLI fallback — reloads model each call (slowest)
165
- SYNTH_MODE="cli"
166
- soprano "$TEXT" -o "$TEMP_FILE" -d "$SOPRANO_DEVICE" 2>/dev/null
167
- fi
168
-
169
- if [[ ! -f "$TEMP_FILE" ]] || [[ ! -s "$TEMP_FILE" ]]; then
170
- echo "❌ Failed to synthesize speech with Soprano ($SYNTH_MODE mode)"
171
- [[ "$SYNTH_MODE" == "webui" ]] && echo " Try: python3 $SCRIPT_DIR/soprano-gradio-synth.py \"test\" /tmp/test.wav $SOPRANO_PORT"
172
- exit 4
173
- fi
174
-
175
- # @function detect_remote_session
176
- # @intent Auto-detect SSH/RDP sessions and enable audio compression
177
- # @why Remote desktop audio is choppy without compression
178
- # @returns Sets AGENTVIBES_RDP_MODE environment variable
179
- if [[ -z "${AGENTVIBES_RDP_MODE:-}" ]]; then
180
- if [[ -n "${SSH_CLIENT:-}" ]] || [[ -n "${SSH_TTY:-}" ]] || [[ "${DISPLAY:-}" =~ ^localhost:.* ]]; then
181
- export AGENTVIBES_RDP_MODE=true
182
- echo "🌐 Remote session detected - enabling audio compression"
183
- fi
184
- fi
185
-
186
- # @function compress_for_remote
187
- # @intent Compress TTS audio for remote sessions (SSH/RDP)
188
- # @why Reduces bandwidth and prevents choppy playback
189
- if [[ "${AGENTVIBES_RDP_MODE:-false}" == "true" ]] && command -v ffmpeg &>/dev/null; then
190
- COMPRESSED_FILE=$(mktemp "$AUDIO_DIR/tts-compressed-XXXXXX.wav")
191
- ffmpeg -i "$TEMP_FILE" -ac 1 -ar 22050 -b:a 64k -y "$COMPRESSED_FILE" 2>/dev/null
192
- if [[ -f "$COMPRESSED_FILE" ]]; then
193
- rm -f "$TEMP_FILE"
194
- TEMP_FILE="$COMPRESSED_FILE"
195
- fi
196
- fi
197
-
198
- # @function add_silence_padding
199
- # @intent Add silence to prevent WSL audio static
200
- # @why WSL audio subsystem cuts off first ~200ms
201
- if command -v ffmpeg &>/dev/null; then
202
- PADDED_FILE=$(mktemp "$AUDIO_DIR/tts-padded-XXXXXX.wav")
203
- ffmpeg -f lavfi -i anullsrc=r=44100:cl=stereo:d=0.2 -i "$TEMP_FILE" \
204
- -filter_complex "[0:a][1:a]concat=n=2:v=0:a=1[out]" \
205
- -map "[out]" -y "$PADDED_FILE" 2>/dev/null
206
- if [[ -f "$PADDED_FILE" ]]; then
207
- rm -f "$TEMP_FILE"
208
- TEMP_FILE="$PADDED_FILE"
209
- fi
210
- fi
211
-
212
- # @function apply_audio_effects
213
- # @intent Apply sox effects and background music via audio-processor.sh
214
- # @param Uses global: $TEMP_FILE
215
- # @returns Updates $TEMP_FILE to processed version
216
- BACKGROUND_MUSIC=""
217
- if [[ -f "$SCRIPT_DIR/audio-processor.sh" ]]; then
218
- PROCESSED_FILE="$AUDIO_DIR/tts-processed-$(date +%s).wav"
219
- PROCESSOR_OUTPUT=$("$SCRIPT_DIR/audio-processor.sh" "$TEMP_FILE" "default" "$PROCESSED_FILE" 2>/dev/null) || {
220
- PROCESSED_FILE="$TEMP_FILE"
221
- PROCESSOR_OUTPUT="$TEMP_FILE|"
222
- }
223
- PROCESSED_FILE="${PROCESSOR_OUTPUT%%|*}"
224
- BACKGROUND_MUSIC="${PROCESSOR_OUTPUT##*|}"
225
- if [[ -f "$PROCESSED_FILE" ]] && [[ "$PROCESSED_FILE" != "$TEMP_FILE" ]]; then
226
- rm -f "$TEMP_FILE"
227
- TEMP_FILE="$PROCESSED_FILE"
228
- fi
229
- fi
230
-
231
- # @function play_audio
232
- # @intent Play generated audio using available player with sequential playback
233
- # @why Support multiple audio players and prevent overlapping audio
234
- # SECURITY: Use user-isolated lock directory (#129)
235
- _LOCK_DIR="${XDG_RUNTIME_DIR:-/tmp/agentvibes-$(id -u)}"
236
- mkdir -p "$_LOCK_DIR"
237
- chmod 700 "$_LOCK_DIR"
238
- LOCK_FILE="$_LOCK_DIR/agentvibes-audio.lock"
239
-
240
- # Auto-remove stale lock files (older than 30 seconds)
241
- if [ -f "$LOCK_FILE" ]; then
242
- if [[ "$(uname)" == "Darwin" ]]; then
243
- _lock_mtime=$(stat -f %m "$LOCK_FILE" 2>/dev/null || echo 0)
244
- else
245
- _lock_mtime=$(stat -c %Y "$LOCK_FILE" 2>/dev/null || echo 0)
246
- fi
247
- _lock_age=$(( $(date +%s) - _lock_mtime ))
248
- if [[ $_lock_age -gt 30 ]]; then
249
- rm -f "$LOCK_FILE"
250
- fi
251
- fi
252
-
253
- for i in {1..4}; do
254
- if [ ! -f "$LOCK_FILE" ]; then
255
- break
256
- fi
257
- sleep 0.5
258
- done
259
-
260
- if [ -f "$LOCK_FILE" ]; then
261
- echo "⏭️ Skipping TTS (previous audio still playing)" >&2
262
- exit 0
263
- fi
264
-
265
- touch "$LOCK_FILE"
266
-
267
- AUDIO_DIR_PLAY="${TEMP_FILE%/*}"
268
- WRITE_LOCK_FILE="$AUDIO_DIR_PLAY/$(basename "$TEMP_FILE" .wav).lock"
269
- touch "$WRITE_LOCK_FILE"
270
-
271
- DURATION=$(ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "$TEMP_FILE" 2>/dev/null)
272
- DURATION=${DURATION%.*}
273
- DURATION=${DURATION:-1}
274
-
275
- if [[ "${AGENTVIBES_TEST_MODE:-false}" != "true" ]] && [[ "${AGENTVIBES_NO_PLAYBACK:-false}" != "true" ]]; then
276
- if [[ "$(uname -s)" == "Darwin" ]]; then
277
- afplay "$TEMP_FILE" >/dev/null 2>&1 &
278
- PLAYER_PID=$!
279
- elif [[ -n "${TERMUX_VERSION:-}" ]] || [[ -d "/data/data/com.termux" ]]; then
280
- termux-media-player play "$TEMP_FILE" >/dev/null 2>&1 &
281
- PLAYER_PID=$!
282
- else
283
- (paplay "$TEMP_FILE" || mpv "$TEMP_FILE" || aplay "$TEMP_FILE") >/dev/null 2>&1 &
284
- PLAYER_PID=$!
285
- fi
286
- fi
287
-
288
- (sleep $DURATION; rm -f "$LOCK_FILE" "$WRITE_LOCK_FILE") &
289
- disown
290
-
291
- # @function display_cache_stats
292
- # @intent Show audio cache statistics with color-coded output
293
- AUDIO_DIR_PATH=$(get_audio_dir)
294
-
295
- BLUE='\033[0;34m'
296
- YELLOW='\033[1;33m'
297
- PURPLE='\033[0;35m'
298
- RED='\033[0;31m'
299
- GREEN='\033[0;32m'
300
- ORANGE='\033[0;33m'
301
- WHITE='\033[1;37m'
302
- CYAN='\033[0;36m'
303
- GOLD='\033[38;5;226m'
304
- NC='\033[0m'
305
-
306
- AUTO_CLEAN_THRESHOLD=$(get_auto_clean_threshold)
307
- INITIAL_SIZE=$(calculate_tts_size_bytes "$AUDIO_DIR_PATH")
308
- if [[ $INITIAL_SIZE -gt $((AUTO_CLEAN_THRESHOLD * 1048576)) ]]; then
309
- DELETED=$(auto_clean_old_files "$AUDIO_DIR_PATH" "$AUTO_CLEAN_THRESHOLD")
310
- if [[ $DELETED -gt 0 ]]; then
311
- echo -e "${ORANGE}🧹 Auto-cleaned $DELETED old files${NC}"
312
- fi
313
- fi
314
-
315
- FILE_COUNT=$(count_tts_files "$AUDIO_DIR_PATH")
316
- SIZE_BYTES=$(calculate_tts_size_bytes "$AUDIO_DIR_PATH")
317
- SIZE_HUMAN=$(bytes_to_human "$SIZE_BYTES")
318
-
319
- CACHE_COLOR=$GREEN
320
- if [[ $SIZE_BYTES -gt 3221225472 ]]; then
321
- CACHE_COLOR=$RED
322
- elif [[ $SIZE_BYTES -gt 524288000 ]]; then
323
- CACHE_COLOR=$YELLOW
324
- fi
325
-
326
- echo -e "${WHITE}💾 Saved to:${NC} ${CYAN}$TEMP_FILE${NC} ${YELLOW}$FILE_COUNT${NC} ${WHITE}🗄️${NC} ${CACHE_COLOR}$SIZE_HUMAN${NC} ${WHITE}🧹${NC}${GOLD}[${AUTO_CLEAN_THRESHOLD}mb]${NC}"
327
-
328
- if [[ -n "$BACKGROUND_MUSIC" ]]; then
329
- MUSIC_FILENAME=$(basename "$BACKGROUND_MUSIC")
330
- echo -e "${WHITE}🎵 Background music:${NC} ${PURPLE}$MUSIC_FILENAME${NC}"
331
- fi
332
- echo -e "${WHITE}🎤 Voice:${NC} ${BLUE}Soprano-1.1-80M${NC} ${WHITE}(Soprano TTS, ${SYNTH_MODE} mode)${NC}"
333
-
334
- # Show personality if configured
335
- PROJECT_ROOT="${PROJECT_ROOT:-$(cd "$SCRIPT_DIR/../.." && pwd)}"
336
- PERSONALITY=$(cat "$PROJECT_ROOT/.claude/tts-personality.txt" 2>/dev/null || cat "$HOME/.claude/tts-personality.txt" 2>/dev/null || echo "")
337
- if [[ -n "$PERSONALITY" ]] && [[ "$PERSONALITY" != "none" ]] && [[ "$PERSONALITY" != "normal" ]]; then
338
- echo -e "${WHITE}💫 Personality:${NC} ${YELLOW}$PERSONALITY${NC}"
339
- fi
340
-
341
- if [[ -d "$AUDIO_DIR_PATH" ]]; then
342
- AUDIO_SIZE=$(du -sm "$AUDIO_DIR_PATH" 2>/dev/null | cut -f1)
343
- if [[ -n "$AUDIO_SIZE" ]] && [[ "$AUDIO_SIZE" -gt 100 ]]; then
344
- echo -e "\033[0;31m⚠️ Audio cache is ${AUDIO_SIZE}MB - Run: /agent-vibes:cleanup\033[0m"
345
- fi
346
- fi
347
-
348
- # Background music status
349
- if [[ -z "$BACKGROUND_MUSIC" ]]; then
350
- BACKGROUND_ENABLED_FILE="$PROJECT_ROOT/.claude/config/background-music-enabled.txt"
351
- if [[ -f "$BACKGROUND_ENABLED_FILE" ]] && grep -q "true" "$BACKGROUND_ENABLED_FILE" 2>/dev/null; then
352
- echo -e "${WHITE}🎵 Background music:${NC} ${PURPLE}Enabled but not playing (check config)${NC}"
353
- else
354
- echo -e "${WHITE}🎵 Background music:${NC} ${PURPLE}Disabled${NC}"
355
- fi
356
- fi
1
+ #!/usr/bin/env bash
2
+ #
3
+ # File: .claude/hooks/play-tts-soprano.sh
4
+ #
5
+ # AgentVibes - Finally, your AI Agents can Talk Back! Text-to-Speech WITH personality for AI Assistants!
6
+ # Website: https://agentvibes.org
7
+ # Repository: https://github.com/paulpreibisch/AgentVibes
8
+ #
9
+ # Co-created by Paul Preibisch with Claude AI
10
+ # Copyright (c) 2025 Paul Preibisch
11
+ #
12
+ # Licensed under the Apache License, Version 2.0 (the "License");
13
+ # you may not use this file except in compliance with the License.
14
+ # You may obtain a copy of the License at
15
+ #
16
+ # http://www.apache.org/licenses/LICENSE-2.0
17
+ #
18
+ # Unless required by applicable law or agreed to in writing, software
19
+ # distributed under the License is distributed on an "AS IS" BASIS,
20
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21
+ # See the License for the specific language governing permissions and
22
+ # limitations under the License.
23
+ #
24
+ # DISCLAIMER: This software is provided "AS IS", WITHOUT WARRANTY OF ANY KIND,
25
+ # express or implied. Use at your own risk. See the Apache License for details.
26
+ #
27
+ # ---
28
+ #
29
+ # @fileoverview Soprano TTS Provider Implementation - Free, local, neural-quality TTS
30
+ # @context Provides ultra-lightweight on-device neural TTS via Soprano (80M params)
31
+ # @architecture Implements provider interface contract with 3 synthesis modes (WebUI/API/CLI)
32
+ # @dependencies soprano-tts (pip), soprano-gradio-synth.py, ffmpeg (optional padding), audio players
33
+ # @entrypoints Called by play-tts.sh router when provider=soprano
34
+ # @patterns Provider contract: text/voice → audio file path, auto-mode detection, Gradio SSE protocol
35
+ # @related play-tts.sh, soprano-gradio-synth.py, provider-manager.sh, GitHub Issue #94
36
+ #
37
+ # Supports three modes (auto-detected in priority order):
38
+ # 1. WebUI mode: Gradio WebUI running (soprano-webui), uses Python helper
39
+ # 2. API mode: OpenAI-compatible server (uvicorn soprano.server:app), uses curl
40
+ # 3. CLI mode: Direct `soprano` command — reloads model each call (slowest)
41
+ #
42
+ # Environment variables:
43
+ # SOPRANO_PORT — WebUI/API port (default: 7860)
44
+ # SOPRANO_DEVICE — Device for CLI mode: auto|cuda|cpu|mps (default: auto)
45
+ #
46
+
47
+ # Fix locale warnings
48
+ export LC_ALL=C
49
+
50
+ TEXT="$1"
51
+ VOICE_OVERRIDE="$2" # Ignored — Soprano has a single voice, kept for provider contract
52
+
53
+ # Strip emojis, asterisks, and markdown formatting
54
+ TEXT=$(printf '%s' "$TEXT" | perl -CSD -pe '
55
+ s/[\x{1F300}-\x{1F9FF}]//g;
56
+ s/[\x{2600}-\x{27BF}]//g;
57
+ s/[\x{FE00}-\x{FE0F}]//g;
58
+ s/[\x{200D}]//g;
59
+ s/[\x{2500}-\x{257F}]//g;
60
+ s/[\x{2580}-\x{259F}]//g;
61
+ s/\*+//g; s/#+\s*//g; s/`//g; s/~+//g; s/^\s*[-]\s*//g;
62
+ ')
63
+
64
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
65
+ source "$SCRIPT_DIR/audio-cache-utils.sh"
66
+
67
+ SOPRANO_PORT="${SOPRANO_PORT:-7860}"
68
+ SOPRANO_DEVICE="${SOPRANO_DEVICE:-auto}"
69
+
70
+ # @function validate_inputs
71
+ # @intent Check required parameters
72
+ # @why Fail fast with clear errors if inputs missing
73
+ # @exitcode 1=missing text
74
+ if [[ -z "$TEXT" ]]; then
75
+ echo "Usage: $0 \"text to speak\" [voice_override]"
76
+ exit 1
77
+ fi
78
+
79
+ # @function check_webui_server
80
+ # @intent Detect if Soprano Gradio WebUI is reachable
81
+ # @why WebUI mode keeps model in memory for fastest repeated synthesis
82
+ # @returns exitcode 0=reachable, 1=not reachable
83
+ check_webui_server() {
84
+ curl -sf --max-time 2 "http://127.0.0.1:${SOPRANO_PORT}/gradio_api/info" -o /dev/null 2>/dev/null ||
85
+ curl -sf --max-time 2 "http://127.0.0.1:${SOPRANO_PORT}/info" -o /dev/null 2>/dev/null
86
+ }
87
+
88
+ # @function check_api_server
89
+ # @intent Detect if Soprano OpenAI-compatible API server is reachable
90
+ # @why API mode is simpler than WebUI (direct WAV response, no SSE polling)
91
+ # @returns exitcode 0=reachable, 1=not reachable
92
+ check_api_server() {
93
+ curl -sf --max-time 2 "http://127.0.0.1:${SOPRANO_PORT}/v1/audio/speech" \
94
+ -H "Content-Type: application/json" \
95
+ -d '{"input":"test"}' -o /dev/null 2>/dev/null
96
+ }
97
+
98
+ # @function check_soprano_available
99
+ # @intent Verify at least one synthesis mode is available
100
+ # @why Provide helpful installation instructions if nothing works
101
+ # @exitcode 2=soprano not installed and no server running
102
+ if ! command -v soprano &>/dev/null && ! check_webui_server && ! check_api_server; then
103
+ echo "❌ Error: Soprano TTS not installed and no server running on port $SOPRANO_PORT"
104
+ echo ""
105
+ echo "Install: pip install soprano-tts"
106
+ echo " (GPU): pip install soprano-tts[lmdeploy]"
107
+ echo ""
108
+ echo "Start WebUI: soprano-webui"
109
+ echo "Start API: uvicorn soprano.server:app --host 127.0.0.1 --port $SOPRANO_PORT"
110
+ exit 2
111
+ fi
112
+
113
+ # @function determine_audio_directory
114
+ # @intent Find appropriate directory for audio file storage
115
+ # @why Supports project-local and global storage
116
+ # @returns Sets $AUDIO_DIR global variable
117
+ # SECURITY: Canonicalize path to prevent traversal (#128)
118
+ if [[ -n "${CLAUDE_PROJECT_DIR:-}" ]]; then
119
+ CLAUDE_PROJECT_DIR=$(cd "${CLAUDE_PROJECT_DIR}" 2>/dev/null && pwd -P) || CLAUDE_PROJECT_DIR=""
120
+ fi
121
+ if [[ -n "${CLAUDE_PROJECT_DIR:-}" ]]; then
122
+ AUDIO_DIR="$CLAUDE_PROJECT_DIR/.claude/audio"
123
+ else
124
+ CURRENT_DIR="$PWD"
125
+ while [[ "$CURRENT_DIR" != "/" ]]; do
126
+ if [[ -d "$CURRENT_DIR/.claude" ]]; then
127
+ AUDIO_DIR="$CURRENT_DIR/.claude/audio"
128
+ break
129
+ fi
130
+ CURRENT_DIR=$(dirname "$CURRENT_DIR")
131
+ done
132
+ if [[ -z "$AUDIO_DIR" ]]; then
133
+ AUDIO_DIR="$HOME/.claude/audio"
134
+ fi
135
+ fi
136
+
137
+ mkdir -p "$AUDIO_DIR"
138
+ # SECURITY: Use mktemp for unpredictable filenames (#130)
139
+ _tmp=$(mktemp "$AUDIO_DIR/tts-XXXXXX"); TEMP_FILE="${_tmp}.wav"; mv "$_tmp" "$TEMP_FILE"
140
+
141
+ # @function synthesize_speech
142
+ # @intent Generate speech using best available Soprano mode
143
+ # @why Auto-detect WebUI → API → CLI for optimal performance
144
+ # @param Uses globals: $TEXT, $SOPRANO_PORT, $SOPRANO_DEVICE
145
+ # @returns Creates WAV file at $TEMP_FILE, sets $SYNTH_MODE
146
+ # @exitcode 4=synthesis error
147
+ SYNTH_MODE=""
148
+
149
+ if check_webui_server; then
150
+ # Gradio WebUI mode — use Python helper for SSE protocol
151
+ SYNTH_MODE="webui"
152
+ python3 "$SCRIPT_DIR/soprano-gradio-synth.py" "$TEXT" "$TEMP_FILE" "$SOPRANO_PORT" 2>/dev/null
153
+ elif check_api_server; then
154
+ # OpenAI-compatible API mode — direct curl
155
+ SYNTH_MODE="api"
156
+ # SECURITY: Use proper JSON encoding to prevent injection (#133)
157
+ _JSON_PAYLOAD=$(printf '%s' "$TEXT" | python3 -c 'import sys,json; print(json.dumps({"input":sys.stdin.read()}))' 2>/dev/null) || \
158
+ _JSON_PAYLOAD=$(printf '{"input":"%s"}' "$(printf '%s' "$TEXT" | sed 's/\\/\\\\/g; s/"/\\"/g; s/\t/\\t/g')")
159
+ curl -sf "http://127.0.0.1:${SOPRANO_PORT}/v1/audio/speech" \
160
+ -H "Content-Type: application/json" \
161
+ -d "$_JSON_PAYLOAD" \
162
+ --output "$TEMP_FILE" 2>/dev/null
163
+ else
164
+ # CLI fallback — reloads model each call (slowest)
165
+ SYNTH_MODE="cli"
166
+ soprano "$TEXT" -o "$TEMP_FILE" -d "$SOPRANO_DEVICE" 2>/dev/null
167
+ fi
168
+
169
+ if [[ ! -f "$TEMP_FILE" ]] || [[ ! -s "$TEMP_FILE" ]]; then
170
+ echo "❌ Failed to synthesize speech with Soprano ($SYNTH_MODE mode)"
171
+ [[ "$SYNTH_MODE" == "webui" ]] && echo " Try: python3 $SCRIPT_DIR/soprano-gradio-synth.py \"test\" /tmp/test.wav $SOPRANO_PORT"
172
+ exit 4
173
+ fi
174
+
175
+ # @function detect_remote_session
176
+ # @intent Auto-detect SSH/RDP sessions and enable audio compression
177
+ # @why Remote desktop audio is choppy without compression
178
+ # @returns Sets AGENTVIBES_RDP_MODE environment variable
179
+ if [[ -z "${AGENTVIBES_RDP_MODE:-}" ]]; then
180
+ if [[ -n "${SSH_CLIENT:-}" ]] || [[ -n "${SSH_TTY:-}" ]] || [[ "${DISPLAY:-}" =~ ^localhost:.* ]]; then
181
+ export AGENTVIBES_RDP_MODE=true
182
+ echo "🌐 Remote session detected - enabling audio compression"
183
+ fi
184
+ fi
185
+
186
+ # @function compress_for_remote
187
+ # @intent Compress TTS audio for remote sessions (SSH/RDP)
188
+ # @why Reduces bandwidth and prevents choppy playback
189
+ if [[ "${AGENTVIBES_RDP_MODE:-false}" == "true" ]] && command -v ffmpeg &>/dev/null; then
190
+ _tmp=$(mktemp "$AUDIO_DIR/tts-compressed-XXXXXX"); COMPRESSED_FILE="${_tmp}.wav"; mv "$_tmp" "$COMPRESSED_FILE"
191
+ ffmpeg -i "$TEMP_FILE" -ac 1 -ar 22050 -b:a 64k -y "$COMPRESSED_FILE" 2>/dev/null
192
+ if [[ -f "$COMPRESSED_FILE" ]]; then
193
+ rm -f "$TEMP_FILE"
194
+ TEMP_FILE="$COMPRESSED_FILE"
195
+ fi
196
+ fi
197
+
198
+ # @function add_silence_padding
199
+ # @intent Add silence to prevent WSL audio static
200
+ # @why WSL audio subsystem cuts off first ~200ms
201
+ if command -v ffmpeg &>/dev/null; then
202
+ _tmp=$(mktemp "$AUDIO_DIR/tts-padded-XXXXXX"); PADDED_FILE="${_tmp}.wav"; mv "$_tmp" "$PADDED_FILE"
203
+ ffmpeg -f lavfi -i anullsrc=r=44100:cl=stereo:d=0.2 -i "$TEMP_FILE" \
204
+ -filter_complex "[0:a][1:a]concat=n=2:v=0:a=1[out]" \
205
+ -map "[out]" -y "$PADDED_FILE" 2>/dev/null
206
+ if [[ -f "$PADDED_FILE" ]]; then
207
+ rm -f "$TEMP_FILE"
208
+ TEMP_FILE="$PADDED_FILE"
209
+ fi
210
+ fi
211
+
212
+ # @function apply_audio_effects
213
+ # @intent Apply sox effects and background music via audio-processor.sh
214
+ # @param Uses global: $TEMP_FILE
215
+ # @returns Updates $TEMP_FILE to processed version
216
+ BACKGROUND_MUSIC=""
217
+ if [[ -f "$SCRIPT_DIR/audio-processor.sh" ]]; then
218
+ PROCESSED_FILE="$AUDIO_DIR/tts-processed-$(date +%s).wav"
219
+ PROCESSOR_OUTPUT=$("$SCRIPT_DIR/audio-processor.sh" "$TEMP_FILE" "default" "$PROCESSED_FILE" 2>/dev/null) || {
220
+ PROCESSED_FILE="$TEMP_FILE"
221
+ PROCESSOR_OUTPUT="$TEMP_FILE|"
222
+ }
223
+ PROCESSED_FILE="${PROCESSOR_OUTPUT%%|*}"
224
+ BACKGROUND_MUSIC="${PROCESSOR_OUTPUT##*|}"
225
+ if [[ -f "$PROCESSED_FILE" ]] && [[ "$PROCESSED_FILE" != "$TEMP_FILE" ]]; then
226
+ rm -f "$TEMP_FILE"
227
+ TEMP_FILE="$PROCESSED_FILE"
228
+ fi
229
+ fi
230
+
231
+ # @function play_audio
232
+ # @intent Play generated audio using available player with sequential playback
233
+ # @why Support multiple audio players and prevent overlapping audio
234
+ # SECURITY: Use user-isolated lock directory (#129)
235
+ _LOCK_DIR="${XDG_RUNTIME_DIR:-/tmp/agentvibes-$(id -u)}"
236
+ mkdir -p "$_LOCK_DIR"
237
+ chmod 700 "$_LOCK_DIR"
238
+ LOCK_FILE="$_LOCK_DIR/agentvibes-audio.lock"
239
+
240
+ # Auto-remove stale lock files (older than 30 seconds)
241
+ if [ -f "$LOCK_FILE" ]; then
242
+ if [[ "$(uname)" == "Darwin" ]]; then
243
+ _lock_mtime=$(stat -f %m "$LOCK_FILE" 2>/dev/null || echo 0)
244
+ else
245
+ _lock_mtime=$(stat -c %Y "$LOCK_FILE" 2>/dev/null || echo 0)
246
+ fi
247
+ _lock_age=$(( $(date +%s) - _lock_mtime ))
248
+ if [[ $_lock_age -gt 30 ]]; then
249
+ rm -f "$LOCK_FILE"
250
+ fi
251
+ fi
252
+
253
+ for i in {1..4}; do
254
+ if [ ! -f "$LOCK_FILE" ]; then
255
+ break
256
+ fi
257
+ sleep 0.5
258
+ done
259
+
260
+ if [ -f "$LOCK_FILE" ]; then
261
+ echo "⏭️ Skipping TTS (previous audio still playing)" >&2
262
+ exit 0
263
+ fi
264
+
265
+ touch "$LOCK_FILE"
266
+
267
+ AUDIO_DIR_PLAY="${TEMP_FILE%/*}"
268
+ WRITE_LOCK_FILE="$AUDIO_DIR_PLAY/$(basename "$TEMP_FILE" .wav).lock"
269
+ touch "$WRITE_LOCK_FILE"
270
+
271
+ DURATION=$(ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "$TEMP_FILE" 2>/dev/null)
272
+ DURATION=${DURATION%.*}
273
+ DURATION=${DURATION:-1}
274
+
275
+ if [[ "${AGENTVIBES_TEST_MODE:-false}" != "true" ]] && [[ "${AGENTVIBES_NO_PLAYBACK:-false}" != "true" ]]; then
276
+ if [[ "$(uname -s)" == "Darwin" ]]; then
277
+ afplay "$TEMP_FILE" >/dev/null 2>&1 &
278
+ PLAYER_PID=$!
279
+ elif [[ -n "${TERMUX_VERSION:-}" ]] || [[ -d "/data/data/com.termux" ]]; then
280
+ termux-media-player play "$TEMP_FILE" >/dev/null 2>&1 &
281
+ PLAYER_PID=$!
282
+ else
283
+ (paplay "$TEMP_FILE" || mpv "$TEMP_FILE" || aplay "$TEMP_FILE") >/dev/null 2>&1 &
284
+ PLAYER_PID=$!
285
+ fi
286
+ fi
287
+
288
+ (sleep $DURATION; rm -f "$LOCK_FILE" "$WRITE_LOCK_FILE") &
289
+ disown
290
+
291
+ # @function display_cache_stats
292
+ # @intent Show audio cache statistics with color-coded output
293
+ AUDIO_DIR_PATH=$(get_audio_dir)
294
+
295
+ BLUE='\033[0;34m'
296
+ YELLOW='\033[1;33m'
297
+ PURPLE='\033[0;35m'
298
+ RED='\033[0;31m'
299
+ GREEN='\033[0;32m'
300
+ ORANGE='\033[0;33m'
301
+ WHITE='\033[1;37m'
302
+ CYAN='\033[0;36m'
303
+ GOLD='\033[38;5;226m'
304
+ NC='\033[0m'
305
+
306
+ AUTO_CLEAN_THRESHOLD=$(get_auto_clean_threshold)
307
+ INITIAL_SIZE=$(calculate_tts_size_bytes "$AUDIO_DIR_PATH")
308
+ if [[ $INITIAL_SIZE -gt $((AUTO_CLEAN_THRESHOLD * 1048576)) ]]; then
309
+ DELETED=$(auto_clean_old_files "$AUDIO_DIR_PATH" "$AUTO_CLEAN_THRESHOLD")
310
+ if [[ $DELETED -gt 0 ]]; then
311
+ echo -e "${ORANGE}🧹 Auto-cleaned $DELETED old files${NC}"
312
+ fi
313
+ fi
314
+
315
+ FILE_COUNT=$(count_tts_files "$AUDIO_DIR_PATH")
316
+ SIZE_BYTES=$(calculate_tts_size_bytes "$AUDIO_DIR_PATH")
317
+ SIZE_HUMAN=$(bytes_to_human "$SIZE_BYTES")
318
+
319
+ CACHE_COLOR=$GREEN
320
+ if [[ $SIZE_BYTES -gt 3221225472 ]]; then
321
+ CACHE_COLOR=$RED
322
+ elif [[ $SIZE_BYTES -gt 524288000 ]]; then
323
+ CACHE_COLOR=$YELLOW
324
+ fi
325
+
326
+ echo -e "${WHITE}💾 Saved to:${NC} ${CYAN}$TEMP_FILE${NC} ${YELLOW}$FILE_COUNT${NC} ${WHITE}🗄️${NC} ${CACHE_COLOR}$SIZE_HUMAN${NC} ${WHITE}🧹${NC}${GOLD}[${AUTO_CLEAN_THRESHOLD}mb]${NC}"
327
+
328
+ if [[ -n "$BACKGROUND_MUSIC" ]]; then
329
+ MUSIC_FILENAME=$(basename "$BACKGROUND_MUSIC")
330
+ echo -e "${WHITE}🎵 Background music:${NC} ${PURPLE}$MUSIC_FILENAME${NC}"
331
+ fi
332
+ echo -e "${WHITE}🎤 Voice:${NC} ${BLUE}Soprano-1.1-80M${NC} ${WHITE}(Soprano TTS, ${SYNTH_MODE} mode)${NC}"
333
+
334
+ # Show personality if configured
335
+ PROJECT_ROOT="${PROJECT_ROOT:-$(cd "$SCRIPT_DIR/../.." && pwd)}"
336
+ PERSONALITY=$(cat "$PROJECT_ROOT/.claude/tts-personality.txt" 2>/dev/null || cat "$HOME/.claude/tts-personality.txt" 2>/dev/null || echo "")
337
+ if [[ -n "$PERSONALITY" ]] && [[ "$PERSONALITY" != "none" ]] && [[ "$PERSONALITY" != "normal" ]]; then
338
+ echo -e "${WHITE}💫 Personality:${NC} ${YELLOW}$PERSONALITY${NC}"
339
+ fi
340
+
341
+ if [[ -d "$AUDIO_DIR_PATH" ]]; then
342
+ AUDIO_SIZE=$(du -sm "$AUDIO_DIR_PATH" 2>/dev/null | cut -f1)
343
+ if [[ -n "$AUDIO_SIZE" ]] && [[ "$AUDIO_SIZE" -gt 100 ]]; then
344
+ echo -e "\033[0;31m⚠️ Audio cache is ${AUDIO_SIZE}MB - Run: /agent-vibes:cleanup\033[0m"
345
+ fi
346
+ fi
347
+
348
+ # Background music status
349
+ if [[ -z "$BACKGROUND_MUSIC" ]]; then
350
+ BACKGROUND_ENABLED_FILE="$PROJECT_ROOT/.claude/config/background-music-enabled.txt"
351
+ if [[ -f "$BACKGROUND_ENABLED_FILE" ]] && grep -q "true" "$BACKGROUND_ENABLED_FILE" 2>/dev/null; then
352
+ echo -e "${WHITE}🎵 Background music:${NC} ${PURPLE}Enabled but not playing (check config)${NC}"
353
+ else
354
+ echo -e "${WHITE}🎵 Background music:${NC} ${PURPLE}Disabled${NC}"
355
+ fi
356
+ fi