smart_prompt 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -2
- data/README.cn.md +55 -4
- data/README.md +55 -4
- data/docs/ANTHROPIC_EXAMPLES.md +559 -0
- data/docs/CONVERSATION_INTEGRATION_SUMMARY.md +155 -0
- data/docs/HISTORY_EXAMPLES_README.md +533 -0
- data/docs/HISTORY_MANAGEMENT_GUIDE.md +797 -0
- data/docs/MONITORING_GUIDE.md +278 -0
- data/docs/MULTIMODAL_README.md +265 -0
- data/docs/RELEVANCE_BASED_STRATEGY_IMPLEMENTATION.md +124 -0
- data/docs/STT_README.md +302 -0
- data/docs/TTS_README.md +303 -0
- data/docs/VIDEO_GENERATION_README.md +246 -0
- data/docs/delete_files_list.md +124 -0
- data/lib/smart_prompt/anthropic_adapter.rb +167 -140
- data/lib/smart_prompt/conversation.rb +195 -42
- data/lib/smart_prompt/engine.rb +20 -10
- data/lib/smart_prompt/openai_adapter.rb +25 -1
- data/lib/smart_prompt/version.rb +1 -1
- data/lib/smart_prompt/worker.rb +5 -2
- data/lib/smart_prompt.rb +2 -1
- metadata +33 -22
data/docs/STT_README.md
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
# SmartPrompt STT Guide
|
|
2
|
+
|
|
3
|
+
This guide explains how to use the new Speech-to-Text (STT) capabilities in SmartPrompt.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
The STT feature adds support for:
|
|
8
|
+
- **Speech-to-Text Transcription**: Convert audio files to text
|
|
9
|
+
- **URL-based Transcription**: Transcribe audio from URLs
|
|
10
|
+
- **Multi-language Support**: Chinese, English, Japanese, Korean
|
|
11
|
+
- **Batch Processing**: Process multiple audio files efficiently
|
|
12
|
+
- **Language Detection**: Automatically detect language from audio
|
|
13
|
+
- **Multiple Formats**: JSON, text, SRT, VTT output formats
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
Make sure you have the required dependencies:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
gem install openai
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Configuration
|
|
24
|
+
|
|
25
|
+
Add the STT adapter to your configuration:
|
|
26
|
+
|
|
27
|
+
```yaml
|
|
28
|
+
# config.yml
|
|
29
|
+
adapters:
|
|
30
|
+
multimodal: "MultimodalAdapter"
|
|
31
|
+
image_generation: "ImageGenerationAdapter"
|
|
32
|
+
video_generation: "VideoGenerationAdapter"
|
|
33
|
+
tts: "TTSAdapter"
|
|
34
|
+
stt: "STTAdapter"
|
|
35
|
+
|
|
36
|
+
llms:
|
|
37
|
+
stt_service:
|
|
38
|
+
adapter: "stt"
|
|
39
|
+
url: "https://api.siliconflow.cn/v1/"
|
|
40
|
+
api_key: "ENV[SILICONFLOW_API_KEY]"
|
|
41
|
+
model: "FunAudioLLM/SenseVoiceSmall"
|
|
42
|
+
|
|
43
|
+
default_llm: "stt_service"
|
|
44
|
+
template_path: "./templates"
|
|
45
|
+
worker_path: "./workers"
|
|
46
|
+
logger_file: "./logs/smart_prompt.log"
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Available Workers
|
|
50
|
+
|
|
51
|
+
### 1. STT Transcriber Worker
|
|
52
|
+
Basic speech-to-text transcription.
|
|
53
|
+
|
|
54
|
+
```ruby
|
|
55
|
+
result = engine.call_worker(:stt_transcriber, {
|
|
56
|
+
audio_file: "./audio.wav",
|
|
57
|
+
language: "zh", # Optional: "zh", "en", "ja", "ko"
|
|
58
|
+
prompt: "专业术语", # Optional: Context prompt
|
|
59
|
+
temperature: 0.0, # Optional: 0.0 to 1.0
|
|
60
|
+
response_format: "json" # Optional: "json", "text", "srt", "vtt"
|
|
61
|
+
})
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### 2. STT URL Transcriber Worker
|
|
65
|
+
Transcribe audio from URL.
|
|
66
|
+
|
|
67
|
+
```ruby
|
|
68
|
+
result = engine.call_worker(:stt_url_transcriber, {
|
|
69
|
+
audio_url: "https://example.com/audio.wav",
|
|
70
|
+
language: "en",
|
|
71
|
+
response_format: "text"
|
|
72
|
+
})
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### 3. Batch STT Worker
|
|
76
|
+
Process multiple audio files.
|
|
77
|
+
|
|
78
|
+
```ruby
|
|
79
|
+
result = engine.call_worker(:batch_stt, {
|
|
80
|
+
audio_files: [
|
|
81
|
+
"./audio1.wav",
|
|
82
|
+
"./audio2.mp3",
|
|
83
|
+
"./audio3.webm"
|
|
84
|
+
],
|
|
85
|
+
language: "zh",
|
|
86
|
+
temperature: 0.0
|
|
87
|
+
})
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### 4. Audio Info Worker
|
|
91
|
+
Get audio file information.
|
|
92
|
+
|
|
93
|
+
```ruby
|
|
94
|
+
result = engine.call_worker(:audio_info, {
|
|
95
|
+
audio_file: "./audio.wav"
|
|
96
|
+
})
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### 5. Language Detector Worker
|
|
100
|
+
Detect language from audio or text.
|
|
101
|
+
|
|
102
|
+
```ruby
|
|
103
|
+
# From audio file
|
|
104
|
+
result = engine.call_worker(:language_detector, {
|
|
105
|
+
audio_file: "./audio.wav"
|
|
106
|
+
})
|
|
107
|
+
|
|
108
|
+
# From text
|
|
109
|
+
result = engine.call_worker(:language_detector, {
|
|
110
|
+
text: "这是一个中文文本"
|
|
111
|
+
})
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### 6. Multi-language STT Worker
|
|
115
|
+
Automatic language detection and transcription.
|
|
116
|
+
|
|
117
|
+
```ruby
|
|
118
|
+
result = engine.call_worker(:multilingual_stt, {
|
|
119
|
+
audio_file: "./audio.wav"
|
|
120
|
+
})
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### 7. STT Format Converter Worker
|
|
124
|
+
Generate transcriptions in multiple formats.
|
|
125
|
+
|
|
126
|
+
```ruby
|
|
127
|
+
result = engine.call_worker(:stt_format_converter, {
|
|
128
|
+
audio_file: "./audio.wav",
|
|
129
|
+
formats: ["json", "text", "srt", "vtt"]
|
|
130
|
+
})
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Direct Adapter Usage
|
|
134
|
+
|
|
135
|
+
You can also use the adapter directly without workers:
|
|
136
|
+
|
|
137
|
+
```ruby
|
|
138
|
+
# Get the adapter
|
|
139
|
+
adapter = engine.llms["stt_service"]
|
|
140
|
+
|
|
141
|
+
# Transcribe audio file
|
|
142
|
+
transcription_data = adapter.transcribe_audio(
|
|
143
|
+
"./audio.wav",
|
|
144
|
+
language: "zh",
|
|
145
|
+
temperature: 0.0,
|
|
146
|
+
response_format: "json"
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
# Transcribe from URL
|
|
150
|
+
transcription_data = adapter.transcribe_audio_url(
|
|
151
|
+
"https://example.com/audio.wav",
|
|
152
|
+
language: "en",
|
|
153
|
+
response_format: "text"
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
# Batch transcription
|
|
157
|
+
batch_result = adapter.transcribe_batch(
|
|
158
|
+
["./audio1.wav", "./audio2.mp3"],
|
|
159
|
+
language: "zh"
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
# Get audio information
|
|
163
|
+
audio_info = adapter.get_audio_info("./audio.wav")
|
|
164
|
+
|
|
165
|
+
# Detect language
|
|
166
|
+
detected_language = adapter.detect_language("这是一个中文文本")
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
## Response Formats
|
|
170
|
+
|
|
171
|
+
### Transcription Response
|
|
172
|
+
```ruby
|
|
173
|
+
{
|
|
174
|
+
text: "转录的文本内容", # Transcribed text
|
|
175
|
+
language: "zh", # Language used
|
|
176
|
+
duration: 120, # Audio duration in seconds
|
|
177
|
+
file_size: 1024000, # File size in bytes
|
|
178
|
+
format: "wav" # Audio format
|
|
179
|
+
}
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
### Batch Response
|
|
183
|
+
```ruby
|
|
184
|
+
{
|
|
185
|
+
total_files: 3, # Total files processed
|
|
186
|
+
successful: 2, # Successful transcriptions
|
|
187
|
+
failed: 1, # Failed transcriptions
|
|
188
|
+
results: [ # Individual results
|
|
189
|
+
{
|
|
190
|
+
file: "./audio1.wav",
|
|
191
|
+
index: 0,
|
|
192
|
+
transcription: { ... },
|
|
193
|
+
success: true
|
|
194
|
+
},
|
|
195
|
+
{
|
|
196
|
+
file: "./audio2.mp3",
|
|
197
|
+
index: 1,
|
|
198
|
+
error: "File not found",
|
|
199
|
+
success: false
|
|
200
|
+
}
|
|
201
|
+
]
|
|
202
|
+
}
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### Audio Information Response
|
|
206
|
+
```ruby
|
|
207
|
+
{
|
|
208
|
+
file_path: "./audio.wav", # File path
|
|
209
|
+
file_name: "audio.wav", # File name
|
|
210
|
+
file_size: 1024000, # File size in bytes
|
|
211
|
+
format: "wav", # Audio format
|
|
212
|
+
estimated_duration: 120, # Estimated duration in seconds
|
|
213
|
+
supported: true # Whether format is supported
|
|
214
|
+
}
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
## Supported Models
|
|
218
|
+
|
|
219
|
+
SiliconFlow supports various STT models:
|
|
220
|
+
- `FunAudioLLM/SenseVoiceSmall` - Multi-language speech recognition
|
|
221
|
+
- `fnlp/MOSS-TTSD-v0.5` - High accuracy speech recognition
|
|
222
|
+
|
|
223
|
+
## Supported Audio Formats
|
|
224
|
+
|
|
225
|
+
- `mp3` - MP3 format
|
|
226
|
+
- `mp4` - MP4 format
|
|
227
|
+
- `mpeg` - MPEG format
|
|
228
|
+
- `mpga` - MPGA format
|
|
229
|
+
- `m4a` - M4A format
|
|
230
|
+
- `wav` - WAV format
|
|
231
|
+
- `webm` - WebM format
|
|
232
|
+
|
|
233
|
+
## Language Support
|
|
234
|
+
|
|
235
|
+
- `zh` - Chinese
|
|
236
|
+
- `en` - English
|
|
237
|
+
- `ja` - Japanese
|
|
238
|
+
- `ko` - Korean
|
|
239
|
+
|
|
240
|
+
## Response Format Options
|
|
241
|
+
|
|
242
|
+
- `json` - JSON format (default)
|
|
243
|
+
- `text` - Plain text format
|
|
244
|
+
- `srt` - SubRip subtitle format
|
|
245
|
+
- `vtt` - WebVTT subtitle format
|
|
246
|
+
|
|
247
|
+
## File Size Limits
|
|
248
|
+
|
|
249
|
+
- **Maximum file size**: 25MB
|
|
250
|
+
- **Recommended duration**: Under 30 minutes
|
|
251
|
+
- **Bitrate**: Standard audio bitrates
|
|
252
|
+
|
|
253
|
+
## Error Handling
|
|
254
|
+
|
|
255
|
+
```ruby
|
|
256
|
+
begin
|
|
257
|
+
result = engine.call_worker(:stt_transcriber, params)
|
|
258
|
+
rescue SmartPrompt::LLMAPIError => e
|
|
259
|
+
puts "API Error: #{e.message}"
|
|
260
|
+
rescue SmartPrompt::Error => e
|
|
261
|
+
puts "General Error: #{e.message}"
|
|
262
|
+
rescue => e
|
|
263
|
+
puts "Unexpected Error: #{e.message}"
|
|
264
|
+
end
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
## Best Practices
|
|
268
|
+
|
|
269
|
+
1. **Audio Quality**: Use clear audio with minimal background noise
|
|
270
|
+
2. **Language Specification**: Specify language for better accuracy
|
|
271
|
+
3. **File Size**: Keep files under the 25MB maximum; larger files are rejected with `Audio file too large`
|
|
272
|
+
4. **Batch Processing**: Use batch workers for multiple files
|
|
273
|
+
5. **Format Selection**: Choose appropriate response format for your use case
|
|
274
|
+
6. **Temperature**: Use lower temperature (0.0-0.2) for more accurate transcriptions
|
|
275
|
+
|
|
276
|
+
## Example
|
|
277
|
+
|
|
278
|
+
See `examples/stt_example.rb` for complete working examples.
|
|
279
|
+
|
|
280
|
+
## Troubleshooting
|
|
281
|
+
|
|
282
|
+
**Common Issues:**
|
|
283
|
+
- **API Key Error**: Ensure `SILICONFLOW_API_KEY` environment variable is set
|
|
284
|
+
- **File Not Found**: Check file path and permissions
|
|
285
|
+
- **Unsupported Format**: Use only supported audio formats
|
|
286
|
+
- **File Too Large**: Maximum file size is 25MB
|
|
287
|
+
- **Network Error**: Check internet connection and API endpoint
|
|
288
|
+
|
|
289
|
+
**Error Messages:**
|
|
290
|
+
- `Audio file not found` - Check file path
|
|
291
|
+
- `Unsupported audio format` - Use supported formats only
|
|
292
|
+
- `Audio file too large` - Reduce file size to under 25MB
|
|
293
|
+
- `Unsupported response format` - Use only supported response formats
|
|
294
|
+
- `Network error: Unable to connect to STT API` - Check network connectivity
|
|
295
|
+
|
|
296
|
+
## Performance Tips
|
|
297
|
+
|
|
298
|
+
1. **Preprocessing**: Normalize audio levels before transcription
|
|
299
|
+
2. **Language Detection**: Use automatic detection for mixed-language content
|
|
300
|
+
3. **Batch Processing**: Process multiple files together for efficiency
|
|
301
|
+
4. **Format Selection**: Use JSON for structured data, text for simple output
|
|
302
|
+
5. **Error Recovery**: Implement retry logic for network failures
|
data/docs/TTS_README.md
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
# SmartPrompt TTS Guide
|
|
2
|
+
|
|
3
|
+
This guide explains how to use the new Text-to-Speech (TTS) capabilities in SmartPrompt.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
The TTS feature adds support for:
|
|
8
|
+
- **Text-to-Speech Synthesis**: Convert text to natural-sounding speech
|
|
9
|
+
- **Multi-language Support**: Chinese, English, Japanese, Korean
|
|
10
|
+
- **Voice Selection**: Multiple predefined voices and custom voices
|
|
11
|
+
- **Speed Control**: Adjust speech speed from 0.25x to 4.0x
|
|
12
|
+
- **Multiple Formats**: MP3, WAV, Opus, PCM output formats
|
|
13
|
+
- **Custom Voices**: Create and manage custom voices from reference audio
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
Make sure you have the required dependencies:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
gem install openai
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Configuration
|
|
24
|
+
|
|
25
|
+
Add the TTS adapter to your configuration:
|
|
26
|
+
|
|
27
|
+
```yaml
|
|
28
|
+
# config.yml
|
|
29
|
+
adapters:
|
|
30
|
+
multimodal: "MultimodalAdapter"
|
|
31
|
+
image_generation: "ImageGenerationAdapter"
|
|
32
|
+
video_generation: "VideoGenerationAdapter"
|
|
33
|
+
tts: "TTSAdapter"
|
|
34
|
+
|
|
35
|
+
llms:
|
|
36
|
+
tts_service:
|
|
37
|
+
adapter: "tts"
|
|
38
|
+
url: "https://api.siliconflow.cn/v1/"
|
|
39
|
+
api_key: "ENV[SILICONFLOW_API_KEY]"
|
|
40
|
+
model: "FunAudioLLM/CosyVoice2-0.5B"
|
|
41
|
+
|
|
42
|
+
default_llm: "tts_service"
|
|
43
|
+
template_path: "./templates"
|
|
44
|
+
worker_path: "./workers"
|
|
45
|
+
logger_file: "./logs/smart_prompt.log"
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Available Workers
|
|
49
|
+
|
|
50
|
+
### 1. TTS Synthesizer Worker
|
|
51
|
+
Basic text-to-speech synthesis.
|
|
52
|
+
|
|
53
|
+
```ruby
|
|
54
|
+
result = engine.call_worker(:tts_synthesizer, {
|
|
55
|
+
text: "欢迎使用智能提示系统",
|
|
56
|
+
voice: "alloy", # Optional: "alloy", "echo", "fable", "onyx", "nova", "shimmer"
|
|
57
|
+
speed: 1.0, # Optional: 0.25 to 4.0
|
|
58
|
+
response_format: "mp3", # Optional: "mp3", "wav", "opus", "pcm"
|
|
59
|
+
language: "zh", # Optional: "zh", "en", "ja", "ko"
|
|
60
|
+
save_to_file: true, # Optional: Save audio to file
|
|
61
|
+
output_dir: "./audio", # Optional: Output directory
|
|
62
|
+
filename_prefix: "tts" # Optional: Filename prefix
|
|
63
|
+
})
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### 2. Multilingual TTS Worker
|
|
67
|
+
Automatic language detection and synthesis.
|
|
68
|
+
|
|
69
|
+
```ruby
|
|
70
|
+
result = engine.call_worker(:multilingual_tts, {
|
|
71
|
+
text: "Hello, this is a demonstration",
|
|
72
|
+
voice: "echo",
|
|
73
|
+
save_to_file: true
|
|
74
|
+
})
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### 3. Voice Selector Worker
|
|
78
|
+
List available voices and test different voices.
|
|
79
|
+
|
|
80
|
+
```ruby
|
|
81
|
+
result = engine.call_worker(:voice_selector, {
|
|
82
|
+
text: "测试不同音色的效果",
|
|
83
|
+
voice: "nova",
|
|
84
|
+
save_to_file: true
|
|
85
|
+
})
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### 4. Speed Variation Worker
|
|
89
|
+
Generate audio at different speeds.
|
|
90
|
+
|
|
91
|
+
```ruby
|
|
92
|
+
result = engine.call_worker(:speed_variation_tts, {
|
|
93
|
+
text: "这是一个语速变化的演示",
|
|
94
|
+
speeds: [0.5, 0.75, 1.0, 1.5, 2.0],
|
|
95
|
+
save_to_file: true
|
|
96
|
+
})
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### 5. Custom Voice Manager Worker
|
|
100
|
+
Manage custom voices (create, list, delete, synthesize).
|
|
101
|
+
|
|
102
|
+
```ruby
|
|
103
|
+
# List voices
|
|
104
|
+
result = engine.call_worker(:custom_voice_manager, {
|
|
105
|
+
action: "list"
|
|
106
|
+
})
|
|
107
|
+
|
|
108
|
+
# Create custom voice
|
|
109
|
+
result = engine.call_worker(:custom_voice_manager, {
|
|
110
|
+
action: "create",
|
|
111
|
+
name: "my_voice",
|
|
112
|
+
reference_audio_file: "./reference.wav",
|
|
113
|
+
description: "My custom voice"
|
|
114
|
+
})
|
|
115
|
+
|
|
116
|
+
# Delete custom voice
|
|
117
|
+
result = engine.call_worker(:custom_voice_manager, {
|
|
118
|
+
action: "delete",
|
|
119
|
+
voice_id: "voice_123"
|
|
120
|
+
})
|
|
121
|
+
|
|
122
|
+
# Synthesize with custom voice
|
|
123
|
+
result = engine.call_worker(:custom_voice_manager, {
|
|
124
|
+
action: "synthesize",
|
|
125
|
+
voice_id: "voice_123",
|
|
126
|
+
text: "使用自定义音色朗读",
|
|
127
|
+
save_to_file: true
|
|
128
|
+
})
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### 6. Batch TTS Worker
|
|
132
|
+
Process multiple texts in batch.
|
|
133
|
+
|
|
134
|
+
```ruby
|
|
135
|
+
result = engine.call_worker(:batch_tts, {
|
|
136
|
+
texts: [
|
|
137
|
+
"第一条文本",
|
|
138
|
+
"第二条文本",
|
|
139
|
+
"第三条文本"
|
|
140
|
+
],
|
|
141
|
+
voice: "alloy",
|
|
142
|
+
save_to_file: true
|
|
143
|
+
})
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## Direct Adapter Usage
|
|
147
|
+
|
|
148
|
+
You can also use the adapter directly without workers:
|
|
149
|
+
|
|
150
|
+
```ruby
|
|
151
|
+
# Get the adapter
|
|
152
|
+
adapter = engine.llms["tts_service"]
|
|
153
|
+
|
|
154
|
+
# Synthesize speech
|
|
155
|
+
audio_data = adapter.synthesize_speech(
|
|
156
|
+
"这是一个直接合成的演示",
|
|
157
|
+
voice: "echo",
|
|
158
|
+
speed: 1.2,
|
|
159
|
+
response_format: "mp3"
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
# Synthesize and save to file
|
|
163
|
+
result = adapter.synthesize_to_file(
|
|
164
|
+
"保存到文件的演示",
|
|
165
|
+
"./audio/demo.mp3",
|
|
166
|
+
voice: "nova",
|
|
167
|
+
speed: 1.0
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
# Get available voices
|
|
171
|
+
voices = adapter.available_voices
|
|
172
|
+
|
|
173
|
+
# Create custom voice
|
|
174
|
+
voice_data = adapter.create_custom_voice(
|
|
175
|
+
"my_voice",
|
|
176
|
+
"./reference.wav",
|
|
177
|
+
description: "My custom voice"
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
# List custom voices
|
|
181
|
+
custom_voices = adapter.list_custom_voices
|
|
182
|
+
|
|
183
|
+
# Delete custom voice
|
|
184
|
+
result = adapter.delete_custom_voice("voice_123")
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
## Response Formats
|
|
188
|
+
|
|
189
|
+
### Audio Data Response
|
|
190
|
+
```ruby
|
|
191
|
+
{
|
|
192
|
+
audio_data: "data:audio/mp3;base64,...", # Base64 encoded audio
|
|
193
|
+
format: "mp3", # Audio format
|
|
194
|
+
text_length: 25, # Input text length
|
|
195
|
+
voice: "alloy" # Voice used
|
|
196
|
+
}
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
### File Response
|
|
200
|
+
```ruby
|
|
201
|
+
{
|
|
202
|
+
file_path: "./audio/demo.mp3", # Saved file path
|
|
203
|
+
text_length: 25, # Input text length
|
|
204
|
+
voice: "alloy", # Voice used
|
|
205
|
+
format: "mp3" # Audio format
|
|
206
|
+
}
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### Voice Management Response
|
|
210
|
+
```ruby
|
|
211
|
+
{
|
|
212
|
+
voice_id: "voice_123", # Voice identifier
|
|
213
|
+
name: "my_voice", # Voice name
|
|
214
|
+
status: "active", # Voice status
|
|
215
|
+
created_at: "2024-01-01..." # Creation timestamp
|
|
216
|
+
}
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
## Supported Models
|
|
220
|
+
|
|
221
|
+
SiliconFlow supports various TTS models:
|
|
222
|
+
- `FunAudioLLM/CosyVoice2-0.5B` - Multi-language support with emotion control
|
|
223
|
+
- `fnlp/MOSS-TTSD-v0.5` - High expressiveness, dual voice cloning
|
|
224
|
+
|
|
225
|
+
## Predefined Voices
|
|
226
|
+
|
|
227
|
+
- `alloy` - 沉稳男声 alex (calm, steady male voice)
|
|
228
|
+
- `echo` - 温柔女声 claire (gentle female voice)
|
|
229
|
+
- `fable` - 活泼女声 fable (lively female voice)
|
|
230
|
+
- `onyx` - 磁性男声 onyx (deep, magnetic male voice)
|
|
231
|
+
- `nova` - 甜美女声 nova (sweet female voice)
|
|
232
|
+
- `shimmer` - 优雅女声 shimmer (elegant female voice)
|
|
233
|
+
|
|
234
|
+
## Language Support
|
|
235
|
+
|
|
236
|
+
- `zh` - Chinese
|
|
237
|
+
- `en` - English
|
|
238
|
+
- `ja` - Japanese
|
|
239
|
+
- `ko` - Korean
|
|
240
|
+
|
|
241
|
+
## Audio Formats
|
|
242
|
+
|
|
243
|
+
- `mp3` - MP3 format (default)
|
|
244
|
+
- `wav` - WAV format
|
|
245
|
+
- `opus` - Opus format
|
|
246
|
+
- `pcm` - PCM format
|
|
247
|
+
|
|
248
|
+
## Speed Control
|
|
249
|
+
|
|
250
|
+
- **Range**: 0.25 to 4.0
|
|
251
|
+
- **Default**: 1.0 (normal speed)
|
|
252
|
+
- **Slow**: 0.25 - 0.75
|
|
253
|
+
- **Fast**: 1.25 - 4.0
|
|
254
|
+
|
|
255
|
+
## Custom Voice Requirements
|
|
256
|
+
|
|
257
|
+
- **Reference Audio**: 8-10 seconds recommended
|
|
258
|
+
- **Audio Quality**: Clear speech, no background noise
|
|
259
|
+
- **File Size**: Maximum 5MB
|
|
260
|
+
- **Formats**: Common audio formats supported
|
|
261
|
+
|
|
262
|
+
## Error Handling
|
|
263
|
+
|
|
264
|
+
```ruby
|
|
265
|
+
begin
|
|
266
|
+
result = engine.call_worker(:tts_synthesizer, params)
|
|
267
|
+
rescue SmartPrompt::LLMAPIError => e
|
|
268
|
+
puts "API Error: #{e.message}"
|
|
269
|
+
rescue SmartPrompt::Error => e
|
|
270
|
+
puts "General Error: #{e.message}"
|
|
271
|
+
rescue => e
|
|
272
|
+
puts "Unexpected Error: #{e.message}"
|
|
273
|
+
end
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
## Best Practices
|
|
277
|
+
|
|
278
|
+
1. **Text Preparation**: Remove unnecessary spaces, use proper punctuation
|
|
279
|
+
2. **Language Selection**: Specify language for better pronunciation
|
|
280
|
+
3. **Speed Adjustment**: Use 0.8-1.2 for natural speech
|
|
281
|
+
4. **Voice Selection**: Test different voices for your use case
|
|
282
|
+
5. **Batch Processing**: Use batch workers for multiple texts
|
|
283
|
+
6. **File Management**: Use `save_to_file: true` for persistent storage
|
|
284
|
+
|
|
285
|
+
## Example
|
|
286
|
+
|
|
287
|
+
See `examples/tts_example.rb` for complete working examples.
|
|
288
|
+
|
|
289
|
+
## Troubleshooting
|
|
290
|
+
|
|
291
|
+
**Common Issues:**
|
|
292
|
+
- **API Key Error**: Ensure `SILICONFLOW_API_KEY` environment variable is set
|
|
293
|
+
- **Text Too Long**: Maximum 4096 characters per request
|
|
294
|
+
- **Invalid Voice**: Use only predefined voice names or valid custom voice IDs
|
|
295
|
+
- **Speed Out of Range**: Speed must be between 0.25 and 4.0
|
|
296
|
+
- **File Permissions**: Ensure write permissions for output directories
|
|
297
|
+
- **Reference Audio**: For custom voices, use clear 8-10 second audio files
|
|
298
|
+
|
|
299
|
+
**Error Messages:**
|
|
300
|
+
- `Text cannot be empty` - Provide non-empty text
|
|
301
|
+
- `Text too long` - Reduce text length to under 4096 characters
|
|
302
|
+
- `Unsupported response format` - Use only supported audio formats
|
|
303
|
+
- `Reference audio file not found` - Check file path for custom voice creation
|