smart_prompt 0.4.4 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -10
- data/README.cn.md +307 -64
- data/README.md +311 -64
- data/Rakefile +10 -1
- data/config/anthropic_config.yml +151 -0
- data/config/image_generation_config.yml +22 -0
- data/config/multimodal_config.yml +85 -0
- data/config/sensenova_config.yml +63 -0
- data/config/zhipu_config.yml +73 -0
- data/examples/anthropic_basic_chat.rb +143 -0
- data/examples/anthropic_example.rb +232 -0
- data/examples/anthropic_multimodal.rb +212 -0
- data/examples/anthropic_streaming.rb +312 -0
- data/examples/anthropic_tool_calling.rb +393 -0
- data/examples/automatic_cleanup_example.rb +109 -0
- data/examples/history_management_examples.rb +522 -0
- data/examples/image_generation_example.rb +130 -0
- data/examples/monitoring_example.rb +121 -0
- data/examples/multimodal_example.rb +63 -0
- data/examples/relevance_based_strategy_example.rb +87 -0
- data/examples/sensenova_example.rb +129 -0
- data/examples/stt_example.rb +287 -0
- data/examples/tts_example.rb +244 -0
- data/examples/video_generation_example.rb +189 -0
- data/examples/zhipu_example.rb +151 -0
- data/lib/smart_prompt/anthropic_adapter.rb +363 -281
- data/lib/smart_prompt/compression_engine.rb +201 -0
- data/lib/smart_prompt/context_strategy.rb +22 -0
- data/lib/smart_prompt/conversation.rb +81 -191
- data/lib/smart_prompt/engine.rb +36 -19
- data/lib/smart_prompt/history_manager.rb +596 -0
- data/lib/smart_prompt/hybrid_strategy.rb +222 -0
- data/lib/smart_prompt/image_generation_adapter.rb +297 -0
- data/lib/smart_prompt/lru_cache.rb +133 -0
- data/lib/smart_prompt/message.rb +57 -0
- data/lib/smart_prompt/multimodal_adapter.rb +277 -0
- data/lib/smart_prompt/openai_adapter.rb +1 -25
- data/lib/smart_prompt/persistence_layer.rb +197 -0
- data/lib/smart_prompt/relevance_based_strategy.rb +221 -0
- data/lib/smart_prompt/sensenova_adapter.rb +410 -0
- data/lib/smart_prompt/session.rb +140 -0
- data/lib/smart_prompt/sliding_window_strategy.rb +100 -0
- data/lib/smart_prompt/stt_adapter.rb +381 -0
- data/lib/smart_prompt/summary_based_strategy.rb +152 -0
- data/lib/smart_prompt/token_counter.rb +74 -0
- data/lib/smart_prompt/tts_adapter.rb +403 -0
- data/lib/smart_prompt/version.rb +1 -1
- data/lib/smart_prompt/video_generation_adapter.rb +330 -0
- data/lib/smart_prompt/worker.rb +25 -3
- data/lib/smart_prompt/zhipu_adapter.rb +616 -0
- data/lib/smart_prompt.rb +22 -2
- data/workers/history_management_examples.rb +407 -0
- data/workers/image_generation_workers.rb +119 -0
- data/workers/multimodal_workers.rb +110 -0
- data/workers/sensenova_workers.rb +62 -0
- data/workers/stt_workers.rb +195 -0
- data/workers/tts_workers.rb +388 -0
- data/workers/video_generation_workers.rb +264 -0
- data/workers/zhipu_workers.rb +113 -0
- metadata +84 -8
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# STT Workers for SmartPrompt
|
|
2
|
+
# These workers demonstrate the new speech-to-text capabilities
|
|
3
|
+
|
|
4
|
+
# Basic speech-to-text worker
|
|
5
|
+
# Worker :stt_transcriber — transcribes one audio file via the "stt_service" adapter.
#
# Params read: :audio_file, :model, :language, :prompt,
#   :temperature (0.0 when absent), :response_format ("json" when absent).
# Evaluates to { transcription: <value returned by transcribe_audio> }.
#
# NOTE(review): the adapter call is given params[:model], not the name passed
# to `model` below — confirm the adapter falls back sensibly when it is nil.
# NOTE(review): "FunAudioLLM/CosyVoice2-0.5B" looks like a speech-synthesis
# model name — confirm it is the intended model for transcription.
SmartPrompt.define_worker :stt_transcriber do
  use "stt_service"
  model "FunAudioLLM/CosyVoice2-0.5B"

  source_file = params[:audio_file]
  transcribe_options = {
    model: params[:model],
    language: params[:language],
    prompt: params[:prompt],
    temperature: params[:temperature] || 0.0,
    response_format: params[:response_format] || "json"
  }

  # The adapter is fetched from the engine and called directly.
  stt = engine.llms["stt_service"]

  { transcription: stt.transcribe_audio(source_file, **transcribe_options) }
end
|
|
33
|
+
|
|
34
|
+
# URL-based speech-to-text worker
|
|
35
|
+
# Worker :stt_url_transcriber — transcribes audio referenced by URL via the
# "stt_service" adapter's transcribe_audio_url.
#
# Params read: :audio_url, :model, :language, :prompt,
#   :temperature (0.0 when absent), :response_format ("json" when absent).
# Evaluates to { transcription: <value returned by transcribe_audio_url> }.
SmartPrompt.define_worker :stt_url_transcriber do
  use "stt_service"
  model "FunAudioLLM/CosyVoice2-0.5B"

  source_url = params[:audio_url]
  transcribe_options = {
    model: params[:model],
    language: params[:language],
    prompt: params[:prompt],
    temperature: params[:temperature] || 0.0,
    response_format: params[:response_format] || "json"
  }

  stt = engine.llms["stt_service"]

  { transcription: stt.transcribe_audio_url(source_url, **transcribe_options) }
end
|
|
62
|
+
|
|
63
|
+
# Batch speech-to-text worker
|
|
64
|
+
# Worker :batch_stt — transcribes several audio files in one adapter call.
#
# Params read: :audio_files (preferred) or :audio_file (single fallback),
#   :model, :language, :prompt, :temperature (0.0 when absent).
# Evaluates to { batch_result: <value returned by transcribe_batch> }, or to
# { error: "..." } when there is nothing to transcribe.
SmartPrompt.define_worker :batch_stt do
  use "stt_service"
  model "FunAudioLLM/CosyVoice2-0.5B"

  # Fall back to the single-file param, but drop it when it is nil so that
  # a missing input does not reach the adapter as `[nil]`.
  audio_files = params[:audio_files] || [params[:audio_file]].compact

  if audio_files.empty?
    # Same error-hash convention the :language_detector worker uses.
    { error: "Either audio_files or audio_file parameter is required" }
  else
    adapter = engine.llms["stt_service"]

    batch_result = adapter.transcribe_batch(
      audio_files,
      model: params[:model],
      language: params[:language],
      prompt: params[:prompt],
      temperature: params[:temperature] || 0.0
    )

    { batch_result: batch_result }
  end
end
|
|
84
|
+
|
|
85
|
+
# Audio file information worker
|
|
86
|
+
# Worker :audio_info — asks the "stt_service" adapter for information about
# the file given in params[:audio_file].
# Evaluates to { audio_info: <value returned by get_audio_info> }.
SmartPrompt.define_worker :audio_info do
  use "stt_service"

  stt = engine.llms["stt_service"]

  { audio_info: stt.get_audio_info(params[:audio_file]) }
end
|
|
96
|
+
|
|
97
|
+
# Language detection worker
|
|
98
|
+
# Worker :language_detector — detects the language of audio or of plain text.
#
# * params[:audio_file] present: transcribes it (forwarding :model, :language
#   and :temperature, 0.0 when absent), then runs detect_language on the
#   transcription's :text entry.
#   => { transcription:, detected_language: }
# * otherwise params[:text] present: runs detect_language on it.
#   => { text:, detected_language: }
# * neither: => { error: "..." }
SmartPrompt.define_worker :language_detector do
  use "stt_service"

  stt = engine.llms["stt_service"]
  audio_path = params[:audio_file]
  raw_text = params[:text]

  if audio_path
    transcript = stt.transcribe_audio(
      audio_path,
      model: params[:model],
      language: params[:language],
      temperature: params[:temperature] || 0.0
    )
    language = stt.detect_language(transcript[:text])

    { transcription: transcript, detected_language: language }
  elsif raw_text
    language = stt.detect_language(raw_text)

    { text: raw_text, detected_language: language }
  else
    { error: "Either audio_file or text parameter is required" }
  end
end
|
|
131
|
+
|
|
132
|
+
# Multi-language STT worker
|
|
133
|
+
# Worker :multilingual_stt — two-pass transcription.
#
# Pass 1 transcribes params[:audio_file] with no language hint and feeds the
# resulting :text to detect_language. When a language other than "en" comes
# back, pass 2 re-transcribes with that language.
#
# Evaluates to
#   { initial_transcription:, improved_transcription:, detected_language: }
# when a second pass ran, otherwise
#   { transcription:, detected_language: }.
SmartPrompt.define_worker :multilingual_stt do
  use "stt_service"
  model "FunAudioLLM/CosyVoice2-0.5B"

  stt = engine.llms["stt_service"]

  first_pass = stt.transcribe_audio(
    params[:audio_file],
    model: params[:model],
    temperature: params[:temperature] || 0.0
  )

  language = stt.detect_language(first_pass[:text])

  if !language || language == "en"
    # Nothing detected, or English: the first pass is the final answer.
    { transcription: first_pass, detected_language: language }
  else
    second_pass = stt.transcribe_audio(
      params[:audio_file],
      model: params[:model],
      language: language,
      temperature: params[:temperature] || 0.0
    )

    {
      initial_transcription: first_pass,
      improved_transcription: second_pass,
      detected_language: language
    }
  end
end
|
|
170
|
+
|
|
171
|
+
# Format conversion worker
|
|
172
|
+
# Worker :stt_format_converter — transcribes the same file once per requested
# response format.
#
# Params read: :audio_file, :model, :language, :temperature (0.0 when absent),
#   :formats (defaults to json, text, srt and vtt).
# Evaluates to { format_results: { <format> => <transcribe_audio result> } }.
SmartPrompt.define_worker :stt_format_converter do
  use "stt_service"
  model "FunAudioLLM/CosyVoice2-0.5B"

  stt = engine.llms["stt_service"]
  requested_formats = params[:formats] || %w[json text srt vtt]

  # One adapter call per format, collected under the format name.
  per_format = requested_formats.each_with_object({}) do |fmt, collected|
    collected[fmt] = stt.transcribe_audio(
      params[:audio_file],
      model: params[:model],
      language: params[:language],
      temperature: params[:temperature] || 0.0,
      response_format: fmt
    )
  end

  { format_results: per_format }
end
|
|
@@ -0,0 +1,388 @@
|
|
|
1
|
+
# TTS Workers for SmartPrompt
|
|
2
|
+
# These workers demonstrate the new text-to-speech capabilities
|
|
3
|
+
|
|
4
|
+
# Basic text-to-speech worker
|
|
5
|
+
# Worker :tts_synthesizer — turns params[:text] into speech via "tts_service".
#
# Params read: :text, :voice ("alloy" when absent), :speed (1.0 when absent),
#   :response_format ("mp3" when absent), :language, :model,
#   :save_to_file, :output_dir ("./generated_audio" when absent),
#   :filename_prefix ("tts_audio" when absent).
#
# With :save_to_file the audio is written to
#   <output_dir>/<prefix>_<epoch seconds>.<format>
# and the worker evaluates to { audio_file: <synthesize_to_file result> };
# otherwise it evaluates to { audio_data: <synthesize_speech result> }.
SmartPrompt.define_worker :tts_synthesizer do
  use "tts_service"
  model "FunAudioLLM/CosyVoice2-0.5B"

  text = params[:text]
  audio_format = params[:response_format] || "mp3"
  synthesis_options = {
    voice: params[:voice] || "alloy",
    model: params[:model],
    speed: params[:speed] || 1.0,
    response_format: audio_format,
    language: params[:language]
  }

  # The adapter is fetched from the engine and called directly.
  tts = engine.llms["tts_service"]

  if params[:save_to_file]
    directory = params[:output_dir] || "./generated_audio"
    prefix = params[:filename_prefix] || "tts_audio"
    destination = File.join(directory, "#{prefix}_#{Time.now.to_i}.#{audio_format}")

    { audio_file: tts.synthesize_to_file(text, destination, **synthesis_options) }
  else
    { audio_data: tts.synthesize_speech(text, **synthesis_options) }
  end
end
|
|
52
|
+
|
|
53
|
+
# Multi-language TTS worker
|
|
54
|
+
# Worker :multilingual_tts — text-to-speech with a script-based language guess.
#
# Params read: :text, :voice ("alloy" when absent), :speed (1.0 when absent),
#   :response_format ("mp3" when absent), :language, :model,
#   :save_to_file, :output_dir ("./multilingual_audio" when absent),
#   :filename_prefix ("multilingual_tts" when absent).
#
# When :language is not supplied it is guessed from the characters in :text:
# kana => "ja", Hangul syllables => "ko", Han ideographs => "zh", else "en".
#
# Evaluates to { audio_file:, detected_language: } when :save_to_file is set,
# otherwise { audio_data:, detected_language: }. :detected_language carries
# the language actually used, whether supplied or guessed.
SmartPrompt.define_worker :multilingual_tts do
  use "tts_service"
  model "FunAudioLLM/CosyVoice2-0.5B"

  text = params[:text]

  # Kana must be tested before Han ideographs: Japanese text routinely mixes
  # kanji (U+4E00-U+9FFF) with kana, so testing the Han range first labels
  # Japanese as "zh". Text written only in kanji is still reported as "zh".
  # `case` uses Regexp#===, which is simply false for nil / non-String text.
  language = params[:language] ||
             case text
             when /[\u3040-\u309f\u30a0-\u30ff]/ then "ja"
             when /[\uac00-\ud7af]/ then "ko"
             when /[\u4e00-\u9fff]/ then "zh"
             else "en"
             end

  tts_params = {
    text: text,
    voice: params[:voice] || "alloy",
    speed: params[:speed] || 1.0,
    response_format: params[:response_format] || "mp3",
    language: language
  }

  adapter = engine.llms["tts_service"]

  if params[:save_to_file]
    output_dir = params[:output_dir] || "./multilingual_audio"
    filename_prefix = params[:filename_prefix] || "multilingual_tts"
    output_path = File.join(output_dir, "#{filename_prefix}_#{tts_params[:language]}_#{Time.now.to_i}.#{tts_params[:response_format]}")

    result = adapter.synthesize_to_file(
      tts_params[:text],
      output_path,
      voice: tts_params[:voice],
      model: params[:model],
      speed: tts_params[:speed],
      response_format: tts_params[:response_format],
      language: tts_params[:language]
    )

    {
      audio_file: result,
      detected_language: tts_params[:language]
    }
  else
    audio_data = adapter.synthesize_speech(
      tts_params[:text],
      voice: tts_params[:voice],
      model: params[:model],
      speed: tts_params[:speed],
      response_format: tts_params[:response_format],
      language: tts_params[:language]
    )

    {
      audio_data: audio_data,
      detected_language: tts_params[:language]
    }
  end
end
|
|
118
|
+
|
|
119
|
+
# Voice selection worker
|
|
120
|
+
# Worker :voice_selector — reports the adapter's available voices and, when
# params[:text] is given, also synthesizes that text with the chosen voice.
#
# Params read: :text, :voice ("alloy" when absent), :speed (1.0 when absent),
#   :response_format ("mp3" when absent), :model, :save_to_file,
#   :output_dir ("./voice_samples" when absent),
#   :filename_prefix ("voice_sample" when absent).
#
# Evaluates to
#   { available_voices: }                                 without :text,
#   { available_voices:, selected_voice:, audio_file: }   with :save_to_file,
#   { available_voices:, selected_voice:, audio_data: }   otherwise.
SmartPrompt.define_worker :voice_selector do
  use "tts_service"

  tts = engine.llms["tts_service"]
  voices = tts.available_voices

  if params[:text]
    chosen_voice = params[:voice] || "alloy"
    audio_format = params[:response_format] || "mp3"
    synthesis_options = {
      voice: chosen_voice,
      model: params[:model],
      speed: params[:speed] || 1.0,
      response_format: audio_format
    }

    audio_entry =
      if params[:save_to_file]
        directory = params[:output_dir] || "./voice_samples"
        prefix = params[:filename_prefix] || "voice_sample"
        destination = File.join(directory, "#{prefix}_#{chosen_voice}_#{Time.now.to_i}.#{audio_format}")

        { audio_file: tts.synthesize_to_file(params[:text], destination, **synthesis_options) }
      else
        { audio_data: tts.synthesize_speech(params[:text], **synthesis_options) }
      end

    { available_voices: voices, selected_voice: chosen_voice }.merge(audio_entry)
  else
    # No text supplied: only list the voices.
    { available_voices: voices }
  end
end
|
|
176
|
+
|
|
177
|
+
# Speed variation worker
|
|
178
|
+
# Worker :speed_variation_tts — synthesizes the same text once per speed.
#
# Params read: :text, :voice ("alloy" when absent),
#   :speeds (defaults to 0.5, 0.75, 1.0, 1.25, 1.5, 2.0),
#   :response_format ("mp3" when absent), :model, :save_to_file,
#   :output_dir ("./speed_variations" when absent), :filename_prefix.
#
# Evaluates to { speed_variations: [...] } with one entry per speed:
# { speed:, audio_file: } when :save_to_file is set, else { speed:, audio_data: }.
SmartPrompt.define_worker :speed_variation_tts do
  use "tts_service"
  model "FunAudioLLM/CosyVoice2-0.5B"

  speeds = params[:speeds] || [0.5, 0.75, 1.0, 1.25, 1.5, 2.0]

  # Nothing below depends on the individual speed, so resolve it once.
  adapter = engine.llms["tts_service"]
  voice = params[:voice] || "alloy"
  response_format = params[:response_format] || "mp3"

  results = speeds.map do |speed|
    if params[:save_to_file]
      output_dir = params[:output_dir] || "./speed_variations"

      # The speed tag is always part of the file name. Previously a
      # caller-supplied prefix replaced it, so every speed rendered within the
      # same second resolved to the same path and overwrote the others.
      speed_tag = "speed_#{speed.to_s.gsub('.', '_')}"
      filename_prefix = params[:filename_prefix] ? "#{params[:filename_prefix]}_#{speed_tag}" : speed_tag
      output_path = File.join(output_dir, "#{filename_prefix}_#{Time.now.to_i}.#{response_format}")

      result = adapter.synthesize_to_file(
        params[:text],
        output_path,
        voice: voice,
        model: params[:model],
        speed: speed,
        response_format: response_format
      )

      { speed: speed, audio_file: result }
    else
      audio_data = adapter.synthesize_speech(
        params[:text],
        voice: voice,
        model: params[:model],
        speed: speed,
        response_format: response_format
      )

      { speed: speed, audio_data: audio_data }
    end
  end

  { speed_variations: results }
end
|
|
233
|
+
|
|
234
|
+
# Custom voice management worker
|
|
235
|
+
# Worker :custom_voice_manager — dispatches on params[:action]:
#
#   "list"       => { action: "list", custom_voices: }
#   "create"     needs :reference_audio_file (also reads :name, :description)
#                => { action: "create", voice_data: }
#   "delete"     needs :voice_id
#                => { action: "delete", result: }
#   "synthesize" needs :voice_id and :text; reads :speed (1.0 when absent),
#                :response_format ("mp3" when absent), :model, :save_to_file,
#                :output_dir ("./custom_voice_audio" when absent),
#                :filename_prefix ("custom_voice" when absent)
#                => { action: "synthesize", voice_id:, audio_file: | audio_data: }
#   anything else => { action: "default", predefined_voices:, custom_voices: }
#
# A missing required param yields { error: "..." } instead.
SmartPrompt.define_worker :custom_voice_manager do
  use "tts_service"

  tts = engine.llms["tts_service"]

  case params[:action]
  when "list"
    { action: "list", custom_voices: tts.list_custom_voices }

  when "create"
    if params[:reference_audio_file]
      created = tts.create_custom_voice(
        params[:name],
        params[:reference_audio_file],
        description: params[:description]
      )
      { action: "create", voice_data: created }
    else
      { error: "reference_audio_file is required for creating custom voice" }
    end

  when "delete"
    if params[:voice_id]
      { action: "delete", result: tts.delete_custom_voice(params[:voice_id]) }
    else
      { error: "voice_id is required for deleting custom voice" }
    end

  when "synthesize"
    if params[:voice_id] && params[:text]
      audio_format = params[:response_format] || "mp3"
      synthesis_options = {
        voice: params[:voice_id], # the custom voice id is passed as the voice
        model: params[:model],
        speed: params[:speed] || 1.0,
        response_format: audio_format
      }

      audio_entry =
        if params[:save_to_file]
          directory = params[:output_dir] || "./custom_voice_audio"
          prefix = params[:filename_prefix] || "custom_voice"
          destination = File.join(directory, "#{prefix}_#{params[:voice_id]}_#{Time.now.to_i}.#{audio_format}")

          { audio_file: tts.synthesize_to_file(params[:text], destination, **synthesis_options) }
        else
          { audio_data: tts.synthesize_speech(params[:text], **synthesis_options) }
        end

      { action: "synthesize", voice_id: params[:voice_id] }.merge(audio_entry)
    else
      { error: "voice_id and text are required for synthesis" }
    end

  else
    # Unknown or missing action: report both predefined and custom voices.
    predefined = tts.available_voices
    custom = tts.list_custom_voices
    { action: "default", predefined_voices: predefined, custom_voices: custom }
  end
end
|
|
327
|
+
|
|
328
|
+
# Batch TTS worker for multiple texts
|
|
329
|
+
# Worker :batch_tts — synthesizes several texts with shared voice settings.
#
# Params read: :texts (preferred) or :text (single fallback),
#   :voice ("alloy" when absent), :speed (1.0 when absent),
#   :response_format ("mp3" when absent), :language, :model, :save_to_file,
#   :output_dir ("./batch_audio" when absent),
#   :filename_prefix ("batch_tts" when absent).
#
# Evaluates to { batch_results: [...] } with one entry per text:
# { text:, index:, audio_file: } when :save_to_file is set,
# else { text:, index:, audio_data: }.
SmartPrompt.define_worker :batch_tts do
  use "tts_service"
  model "FunAudioLLM/CosyVoice2-0.5B"

  texts = params[:texts] || [params[:text]]

  # Nothing below depends on the individual text, so resolve it once.
  adapter = engine.llms["tts_service"]
  voice = params[:voice] || "alloy"
  speed = params[:speed] || 1.0
  response_format = params[:response_format] || "mp3"
  language = params[:language]

  results = texts.each_with_index.map do |text, index|
    if params[:save_to_file]
      output_dir = params[:output_dir] || "./batch_audio"

      # The index is always part of the file name. Previously a caller-supplied
      # prefix replaced "batch_tts_<index>", so every text rendered within the
      # same second resolved to the same path and overwrote the others.
      filename_prefix = "#{params[:filename_prefix] || 'batch_tts'}_#{index}"
      output_path = File.join(output_dir, "#{filename_prefix}_#{Time.now.to_i}.#{response_format}")

      result = adapter.synthesize_to_file(
        text,
        output_path,
        voice: voice,
        model: params[:model],
        speed: speed,
        response_format: response_format,
        language: language
      )

      { text: text, index: index, audio_file: result }
    else
      audio_data = adapter.synthesize_speech(
        text,
        voice: voice,
        model: params[:model],
        speed: speed,
        response_format: response_format,
        language: language
      )

      { text: text, index: index, audio_data: audio_data }
    end
  end

  { batch_results: results }
end
|