elevenlabs_client 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +52 -1
- data/README.md +78 -1
- data/lib/elevenlabs_client/client.rb +63 -1
- data/lib/elevenlabs_client/endpoints/audio_isolation.rb +71 -0
- data/lib/elevenlabs_client/endpoints/audio_native.rb +103 -0
- data/lib/elevenlabs_client/endpoints/dubs.rb +208 -2
- data/lib/elevenlabs_client/endpoints/forced_alignment.rb +41 -0
- data/lib/elevenlabs_client/endpoints/speech_to_speech.rb +125 -0
- data/lib/elevenlabs_client/endpoints/speech_to_text.rb +108 -0
- data/lib/elevenlabs_client/endpoints/text_to_dialogue_stream.rb +50 -0
- data/lib/elevenlabs_client/endpoints/text_to_speech_stream.rb +1 -0
- data/lib/elevenlabs_client/endpoints/text_to_speech_stream_with_timestamps.rb +75 -0
- data/lib/elevenlabs_client/endpoints/text_to_speech_with_timestamps.rb +73 -0
- data/lib/elevenlabs_client/endpoints/voices.rb +362 -0
- data/lib/elevenlabs_client/endpoints/websocket_text_to_speech.rb +250 -0
- data/lib/elevenlabs_client/version.rb +1 -1
- data/lib/elevenlabs_client.rb +9 -2
- metadata +25 -2
@@ -8,6 +8,7 @@ module ElevenlabsClient
|
|
8
8
|
|
9
9
|
# POST /v1/dubbing (multipart)
|
10
10
|
# Creates a new dubbing job
|
11
|
+
# Documentation: https://elevenlabs.io/docs/api-reference/dubbing/create
|
11
12
|
#
|
12
13
|
# @param file_io [IO] The audio/video file to dub
|
13
14
|
# @param filename [String] Original filename
|
@@ -19,8 +20,9 @@ module ElevenlabsClient
|
|
19
20
|
payload = {
|
20
21
|
file: @client.file_part(file_io, filename),
|
21
22
|
mode: "automatic",
|
22
|
-
|
23
|
-
|
23
|
+
name: name,
|
24
|
+
target_lang: target_languages.first,
|
25
|
+
num_speakers: 1
|
24
26
|
}.compact.merge(options)
|
25
27
|
|
26
28
|
@client.post_multipart("/v1/dubbing", payload)
|
@@ -28,6 +30,7 @@ module ElevenlabsClient
|
|
28
30
|
|
29
31
|
# GET /v1/dubbing/{id}
|
30
32
|
# Retrieves dubbing job details
|
33
|
+
# Documentation: https://elevenlabs.io/docs/api-reference/dubbing/get
|
31
34
|
#
|
32
35
|
# @param dubbing_id [String] The dubbing job ID
|
33
36
|
# @return [Hash] Dubbing job details
|
@@ -37,6 +40,7 @@ module ElevenlabsClient
|
|
37
40
|
|
38
41
|
# GET /v1/dubbing
|
39
42
|
# Lists dubbing jobs
|
43
|
+
# Documentation: https://elevenlabs.io/docs/api-reference/dubbing
|
40
44
|
#
|
41
45
|
# @param params [Hash] Query parameters (dubbing_status, page_size, etc.)
|
42
46
|
# @return [Hash] List of dubbing jobs
|
@@ -46,6 +50,7 @@ module ElevenlabsClient
|
|
46
50
|
|
47
51
|
# GET /v1/dubbing/{id}/resources
|
48
52
|
# Retrieves dubbing resources for editing (if dubbing_studio: true was used)
|
53
|
+
# Documentation: https://elevenlabs.io/docs/api-reference/dubbing/resources/get-resource
|
49
54
|
#
|
50
55
|
# @param dubbing_id [String] The dubbing job ID
|
51
56
|
# @return [Hash] Dubbing resources
|
@@ -53,6 +58,207 @@ module ElevenlabsClient
|
|
53
58
|
@client.get("/v1/dubbing/#{dubbing_id}/resources")
|
54
59
|
end
|
55
60
|
|
61
|
+
# DELETE /v1/dubbing/{id}
# Removes an existing dubbing project.
# Documentation: https://elevenlabs.io/docs/api-reference/dubbing/delete
#
# @param dubbing_id [String] The dubbing job ID
# @return [Hash] Response with deletion status
def delete(dubbing_id)
  endpoint = "/v1/dubbing/#{dubbing_id}"
  @client.delete(endpoint)
end
|
70
|
+
|
71
|
+
# GET /v1/dubbing/resource/{dubbing_id}
# Fetches the detailed dubbing resource (segments, speakers, etc.).
# Documentation: https://elevenlabs.io/docs/api-reference/dubbing/resources/get-resource
#
# @param dubbing_id [String] The dubbing job ID
# @return [Hash] Detailed dubbing resource information
def get_resource(dubbing_id)
  endpoint = "/v1/dubbing/resource/#{dubbing_id}"
  @client.get(endpoint)
end
|
80
|
+
|
81
|
+
# POST /v1/dubbing/resource/{dubbing_id}/speaker/{speaker_id}/segment
# Adds a new segment for a speaker in the dubbing resource.
# Documentation: https://elevenlabs.io/docs/api-reference/dubbing/resources/create-segment
#
# @param dubbing_id [String] The dubbing job ID
# @param speaker_id [String] The speaker ID
# @param start_time [Float] Start time of the segment
# @param end_time [Float] End time of the segment
# @param text [String, nil] Optional text for the segment
# @param translations [Hash, nil] Optional translations map
# @return [Hash] Response with version and new segment ID
def create_segment(dubbing_id:, speaker_id:, start_time:, end_time:, text: nil, translations: nil)
  endpoint = "/v1/dubbing/resource/#{dubbing_id}/speaker/#{speaker_id}/segment"
  # nil entries are stripped so only provided fields are sent
  body = {
    start_time: start_time,
    end_time: end_time,
    text: text,
    translations: translations
  }.compact

  @client.post(endpoint, body)
end
|
102
|
+
|
103
|
+
# DELETE /v1/dubbing/resource/{dubbing_id}/segment/{segment_id}
# Removes a single segment from the dubbing resource.
# Documentation: https://elevenlabs.io/docs/api-reference/dubbing/resources/delete-segment
#
# @param dubbing_id [String] The dubbing job ID
# @param segment_id [String] The segment ID
# @return [Hash] Response with version
def delete_segment(dubbing_id, segment_id)
  endpoint = "/v1/dubbing/resource/#{dubbing_id}/segment/#{segment_id}"
  @client.delete(endpoint)
end
|
113
|
+
|
114
|
+
# PATCH /v1/dubbing/resource/{dubbing_id}/segment/{segment_id}/{language}
# Updates a single segment's text and/or start/end times for one language.
# Documentation: https://elevenlabs.io/docs/api-reference/dubbing/resources/update-segment
#
# @param dubbing_id [String] The dubbing job ID
# @param segment_id [String] The segment ID
# @param language [String] The language ID
# @param start_time [Float, nil] Optional new start time
# @param end_time [Float, nil] Optional new end time
# @param text [String, nil] Optional new text
# @return [Hash] Response with version
def update_segment(dubbing_id:, segment_id:, language:, start_time: nil, end_time: nil, text: nil)
  endpoint = "/v1/dubbing/resource/#{dubbing_id}/segment/#{segment_id}/#{language}"
  # only send the fields the caller actually provided
  changes = { start_time: start_time, end_time: end_time, text: text }.compact

  @client.patch(endpoint, changes)
end
|
134
|
+
|
135
|
+
# POST /v1/dubbing/resource/{dubbing_id}/transcribe
# Re-runs transcription for the given segments.
# Documentation: https://elevenlabs.io/docs/api-reference/dubbing/resources/transcribe-segment
#
# @param dubbing_id [String] The dubbing job ID
# @param segments [Array<String>] List of segment IDs to transcribe
# @return [Hash] Response with version
def transcribe_segment(dubbing_id, segments)
  @client.post("/v1/dubbing/resource/#{dubbing_id}/transcribe", { segments: segments })
end
|
146
|
+
|
147
|
+
# POST /v1/dubbing/resource/{dubbing_id}/translate
# Re-runs translation for the given segments, optionally limited to languages.
# Documentation: https://elevenlabs.io/docs/api-reference/dubbing/resources/translate-segment
#
# @param dubbing_id [String] The dubbing job ID
# @param segments [Array<String>] List of segment IDs to translate
# @param languages [Array<String>, nil] Optional list of target languages
# @return [Hash] Response with version
def translate_segment(dubbing_id, segments, languages = nil)
  body = { segments: segments, languages: languages }.compact
  @client.post("/v1/dubbing/resource/#{dubbing_id}/translate", body)
end
|
163
|
+
|
164
|
+
# POST /v1/dubbing/resource/{dubbing_id}/dub
# Re-generates dubs for the given segments, optionally limited to languages.
# Documentation: https://elevenlabs.io/docs/api-reference/dubbing/resources/dub-segment
#
# @param dubbing_id [String] The dubbing job ID
# @param segments [Array<String>] List of segment IDs to dub
# @param languages [Array<String>, nil] Optional list of target languages
# @return [Hash] Response with version
def dub_segment(dubbing_id, segments, languages = nil)
  body = { segments: segments, languages: languages }.compact
  @client.post("/v1/dubbing/resource/#{dubbing_id}/dub", body)
end
|
180
|
+
|
181
|
+
# POST /v1/dubbing/resource/{dubbing_id}/render/{language}
# Renders the output media for one language of the dubbing project.
# Documentation: https://elevenlabs.io/docs/api-reference/dubbing/resources/render-project
#
# @param dubbing_id [String] The dubbing job ID
# @param language [String] The language to render
# @param render_type [String] Render type (mp4, aac, mp3, wav, aaf, tracks_zip, clips_zip)
# @param normalize_volume [Boolean, nil] Whether to normalize volume (server default: false)
# @return [Hash] Response with version and render_id
def render_project(dubbing_id:, language:, render_type:, normalize_volume: nil)
  endpoint = "/v1/dubbing/resource/#{dubbing_id}/render/#{language}"
  body = { render_type: render_type, normalize_volume: normalize_volume }.compact

  @client.post(endpoint, body)
end
|
198
|
+
|
199
|
+
# PATCH /v1/dubbing/resource/{dubbing_id}/speaker/{speaker_id}
# Updates speaker metadata, such as the assigned voice.
# Documentation: https://elevenlabs.io/docs/api-reference/dubbing/resources/update-speaker
#
# @param dubbing_id [String] The dubbing job ID
# @param speaker_id [String] The speaker ID
# @param voice_id [String, nil] Voice ID from library or 'track-clone'/'clip-clone'
# @param languages [Array<String>, nil] Languages to apply changes to
# @return [Hash] Response with version
def update_speaker(dubbing_id:, speaker_id:, voice_id: nil, languages: nil)
  endpoint = "/v1/dubbing/resource/#{dubbing_id}/speaker/#{speaker_id}"
  changes = { voice_id: voice_id, languages: languages }.compact

  @client.patch(endpoint, changes)
end
|
216
|
+
|
217
|
+
# GET /v1/dubbing/resource/{dubbing_id}/speaker/{speaker_id}/similar-voices
# Lists voices similar to the given speaker.
# Documentation: https://elevenlabs.io/docs/api-reference/dubbing/resources/get-similar-voices
#
# @param dubbing_id [String] The dubbing job ID
# @param speaker_id [String] The speaker ID
# @return [Hash] Response with list of similar voices
def get_similar_voices(dubbing_id, speaker_id)
  endpoint = "/v1/dubbing/resource/#{dubbing_id}/speaker/#{speaker_id}/similar-voices"
  @client.get(endpoint)
end
|
227
|
+
|
228
|
+
# GET /v1/dubbing/{dubbing_id}/audio/{language_code}
# Returns the dub as a streamed MP3 or MP4 file.
# Documentation: https://elevenlabs.io/docs/api-reference/dubbing/audio/get
#
# @param dubbing_id [String] ID of the dubbing project
# @param language_code [String] ID of the language
# @return [String] Binary audio/video data
def get_dubbed_audio(dubbing_id, language_code)
  @client.get("/v1/dubbing/#{dubbing_id}/audio/#{language_code}")
end
|
239
|
+
|
240
|
+
# GET /v1/dubbing/{dubbing_id}/transcript/{language_code}
# Returns the transcript for the dub as an SRT or WEBVTT file.
# Documentation: https://elevenlabs.io/docs/api-reference/dubbing/transcript/get-transcript-for-dub
#
# @param dubbing_id [String] ID of the dubbing project
# @param language_code [String] ID of the language
# @param options [Hash] Optional parameters
# @option options [String] :format_type Format to use ("srt" or "webvtt", default: "srt")
# @return [String] Transcript in the requested format
def get_dubbed_transcript(dubbing_id, language_code, **options)
  endpoint = "/v1/dubbing/#{dubbing_id}/transcript/#{language_code}"
  # only :format_type is forwarded as a query parameter; other options are ignored
  format = options[:format_type]
  params = format ? { format_type: format } : {}

  @client.get(endpoint, params)
end
|
257
|
+
|
258
|
+
# Alias methods for convenience
|
259
|
+
alias_method :dubbed_audio, :get_dubbed_audio
|
260
|
+
alias_method :dubbed_transcript, :get_dubbed_transcript
|
261
|
+
|
56
262
|
private
|
57
263
|
|
58
264
|
attr_reader :client
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# frozen_string_literal: true

module ElevenlabsClient
  # Endpoint wrapper for the Forced Alignment API: aligns an audio file
  # against a known transcript to get per-character and per-word timings.
  class ForcedAlignment
    def initialize(client)
      @client = client
    end

    # POST /v1/forced-alignment
    # Force-aligns an audio file to text, returning timing information for
    # each character and word.
    # Documentation: https://elevenlabs.io/docs/api-reference/forced-alignment
    #
    # @param audio_file [IO, File] The audio file to align (must be less than 1GB)
    # @param filename [String] Original filename for the audio file
    # @param text [String] The text to align with the audio
    # @param options [Hash] Optional parameters
    # @option options [Boolean] :enabled_spooled_file Stream the file in chunks for large files (defaults to false)
    # @return [Hash] JSON response containing characters, words arrays with timing info, and loss score
    def create(audio_file, filename, text, **options)
      payload = {
        file: client.file_part(audio_file, filename),
        text: text
      }

      # explicit nil check so a caller-supplied `false` is still forwarded
      spooled = options[:enabled_spooled_file]
      payload[:enabled_spooled_file] = spooled unless spooled.nil?

      client.post_multipart("/v1/forced-alignment", payload)
    end

    # Convenience aliases
    alias_method :align, :create
    alias_method :force_align, :create

    private

    attr_reader :client
  end
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
# frozen_string_literal: true

module ElevenlabsClient
  # Endpoints for the Speech-to-Speech (voice changer) API: transform audio
  # from one voice to another while maintaining emotion, timing and delivery.
  #
  # Refactor note: the query-string and multipart-payload construction was
  # duplicated verbatim between #convert and #convert_stream; it now lives in
  # the private helpers #append_query and #multipart_payload so both endpoints
  # stay in sync.
  class SpeechToSpeech
    def initialize(client)
      @client = client
    end

    # POST /v1/speech-to-speech/:voice_id
    # Transform audio from one voice to another.
    # Documentation: https://elevenlabs.io/docs/api-reference/speech-to-speech
    #
    # @param voice_id [String] ID of the voice to be used
    # @param audio_file [IO, File] The audio file which holds the content and emotion
    # @param filename [String] Original filename for the audio file
    # @param options [Hash] Optional parameters
    # @option options [Boolean] :enable_logging Enable logging (default: true)
    # @option options [Integer] :optimize_streaming_latency Latency optimization level (0-4, deprecated)
    # @option options [String] :output_format Output format (default: "mp3_44100_128")
    # @option options [String] :model_id Model identifier (default: "eleven_english_sts_v2")
    # @option options [String] :voice_settings JSON encoded voice settings
    # @option options [Integer] :seed Deterministic sampling seed (0-4294967295)
    # @option options [Boolean] :remove_background_noise Remove background noise (default: false)
    # @option options [String] :file_format Input file format ("pcm_s16le_16" or "other")
    # @return [String] Binary audio data
    def convert(voice_id, audio_file, filename, **options)
      endpoint = append_query("/v1/speech-to-speech/#{voice_id}", options)
      @client.post_multipart(endpoint, multipart_payload(audio_file, filename, options))
    end

    # POST /v1/speech-to-speech/:voice_id/stream
    # Stream audio from one voice to another.
    # Documentation: https://elevenlabs.io/docs/api-reference/speech-to-speech/stream
    #
    # Accepts the same parameters and options as #convert.
    # @param block [Proc] Block to handle each chunk of streaming audio data
    # @return [Faraday::Response] Response object for streaming
    def convert_stream(voice_id, audio_file, filename, **options, &block)
      endpoint = append_query("/v1/speech-to-speech/#{voice_id}/stream", options)
      payload = multipart_payload(audio_file, filename, options)

      # NOTE(review): reaches into the client's Faraday connection directly;
      # presumably there is no public streaming-multipart helper yet — confirm.
      response = @client.instance_variable_get(:@conn).post(endpoint) do |req|
        req.headers["xi-api-key"] = @client.api_key
        req.body = payload

        # Stream chunks to the caller's block as they arrive
        if block_given?
          req.options.on_data = proc do |chunk, _|
            block.call(chunk)
          end
        end
      end

      @client.send(:handle_response, response)
    end

    # Convenience aliases
    alias_method :voice_changer, :convert
    alias_method :voice_changer_stream, :convert_stream

    private

    attr_reader :client

    # Appends the supported query parameters to +base+; returns +base+
    # unchanged when no query options were given. enable_logging uses a nil
    # check so an explicit `false` is still sent.
    def append_query(base, options)
      params = {}
      params[:enable_logging] = options[:enable_logging] unless options[:enable_logging].nil?
      params[:optimize_streaming_latency] = options[:optimize_streaming_latency] if options[:optimize_streaming_latency]
      params[:output_format] = options[:output_format] if options[:output_format]
      return base if params.empty?

      "#{base}?#{params.map { |k, v| "#{k}=#{v}" }.join("&")}"
    end

    # Builds the multipart form payload shared by both endpoints.
    def multipart_payload(audio_file, filename, options)
      payload = { audio: @client.file_part(audio_file, filename) }
      payload[:model_id] = options[:model_id] if options[:model_id]
      payload[:voice_settings] = options[:voice_settings] if options[:voice_settings]
      payload[:seed] = options[:seed] if options[:seed]
      payload[:remove_background_noise] = options[:remove_background_noise] unless options[:remove_background_noise].nil?
      payload[:file_format] = options[:file_format] if options[:file_format]
      payload
    end
  end
end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
# frozen_string_literal: true

module ElevenlabsClient
  # Endpoints for the Speech-to-Text API: transcribe audio/video files or
  # files referenced by a cloud storage URL.
  class SpeechToText
    def initialize(client)
      @client = client
    end

    # POST /v1/speech-to-text
    # Transcribe an audio or video file.
    # Documentation: https://elevenlabs.io/docs/api-reference/speech-to-text
    #
    # @param model_id [String] The ID of the model to use for transcription
    # @param options [Hash] Optional parameters
    # @option options [IO, File] :file The file to transcribe (required if no cloud_storage_url)
    # @option options [String] :filename Original filename (required if file provided)
    # @option options [String] :cloud_storage_url HTTPS URL of file to transcribe (required if no file)
    # @option options [Boolean] :enable_logging Enable logging (default: true)
    # @option options [String] :language_code ISO-639-1 or ISO-639-3 language code
    # @option options [Boolean] :tag_audio_events Tag audio events like (laughter) (default: true)
    # @option options [Integer] :num_speakers Maximum number of speakers (1-32)
    # @option options [String] :timestamps_granularity Timestamp granularity ("none", "word", "character")
    # @option options [Boolean] :diarize Annotate which speaker is talking (default: false)
    # @option options [Float] :diarization_threshold Diarization threshold (0.1-0.4)
    # @option options [Array] :additional_formats Additional export formats
    # @option options [String] :file_format Input file format ("pcm_s16le_16" or "other")
    # @option options [Boolean] :webhook Send result to webhook (default: false)
    # @option options [String] :webhook_id Specific webhook ID
    # @option options [Float] :temperature Randomness control (0.0-2.0)
    # @option options [Integer] :seed Deterministic sampling seed (0-2147483647)
    # @option options [Boolean] :use_multi_channel Multi-channel processing (default: false)
    # @option options [String, Hash] :webhook_metadata Metadata for webhook
    # @raise [ArgumentError] when neither :file/:filename nor :cloud_storage_url is given
    # @return [Hash] Transcription result or webhook response
    def create(model_id, **options)
      endpoint = "/v1/speech-to-text"
      # enable_logging is the only query parameter; nil check keeps `false`
      unless options[:enable_logging].nil?
        endpoint = "#{endpoint}?enable_logging=#{options[:enable_logging]}"
      end

      payload = { model_id: model_id }

      # Exactly one of file (+filename) or cloud_storage_url must be given
      if options[:file] && options[:filename]
        payload[:file] = @client.file_part(options[:file], options[:filename])
      elsif options[:cloud_storage_url]
        payload[:cloud_storage_url] = options[:cloud_storage_url]
      else
        raise ArgumentError, "Either :file with :filename or :cloud_storage_url must be provided"
      end

      # Copy the optional form fields. :non_nil fields are booleans whose
      # explicit `false` must still be forwarded; :truthy fields are skipped
      # when absent or nil.
      [
        [:language_code, :truthy], [:tag_audio_events, :non_nil],
        [:num_speakers, :truthy], [:timestamps_granularity, :truthy],
        [:diarize, :non_nil], [:diarization_threshold, :truthy],
        [:additional_formats, :truthy], [:file_format, :truthy],
        [:webhook, :non_nil], [:webhook_id, :truthy],
        [:temperature, :truthy], [:seed, :truthy],
        [:use_multi_channel, :non_nil]
      ].each do |key, mode|
        value = options[key]
        next if mode == :truthy ? !value : value.nil?

        payload[key] = value
      end

      # webhook_metadata may be a pre-encoded String or a Hash to serialize
      meta = options[:webhook_metadata]
      payload[:webhook_metadata] = meta.is_a?(Hash) ? meta.to_json : meta if meta

      @client.post_multipart(endpoint, payload)
    end

    # GET /v1/speech-to-text/transcripts/:transcription_id
    # Retrieve a previously generated transcript by its ID.
    # Documentation: https://elevenlabs.io/docs/api-reference/speech-to-text/get-transcript
    #
    # @param transcription_id [String] The unique ID of the transcript to retrieve
    # @return [Hash] The transcript data
    def get_transcript(transcription_id)
      @client.get("/v1/speech-to-text/transcripts/#{transcription_id}")
    end

    # Convenience aliases
    alias_method :transcribe, :create
    alias_method :get_transcription, :get_transcript
    alias_method :retrieve_transcript, :get_transcript

    private

    attr_reader :client
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# frozen_string_literal: true

module ElevenlabsClient
  # Streaming endpoint for the Text-to-Dialogue API.
  class TextToDialogueStream
    # Optional body fields forwarded to the API when present.
    BODY_OPTION_KEYS = %i[
      model_id language_code settings
      pronunciation_dictionary_locators seed apply_text_normalization
    ].freeze

    def initialize(client)
      @client = client
    end

    # POST /v1/text-to-dialogue/stream
    # Converts a list of text/voice-ID pairs into dialogue speech and streams
    # the resulting audio.
    # Documentation: https://elevenlabs.io/docs/api-reference/text-to-dialogue/stream
    #
    # @param inputs [Array<Hash>] A list of dialogue inputs, each containing text and a voice ID
    # @param options [Hash] Optional parameters
    # @option options [String] :model_id Identifier of the model to be used (default: "eleven_v3")
    # @option options [String] :language_code ISO 639-1 language code
    # @option options [Hash] :settings Settings controlling the dialogue generation
    # @option options [Array<Hash>] :pronunciation_dictionary_locators Pronunciation dictionary locators (max 3)
    # @option options [Integer] :seed Deterministic sampling seed (0-4294967295)
    # @option options [String] :apply_text_normalization Text normalization mode ("auto", "on", "off")
    # @option options [String] :output_format Output format (defaults to "mp3_44100_128")
    # @param block [Proc] Block to handle each audio chunk
    # @return [Faraday::Response] The response object
    def stream(inputs, **options, &block)
      format = options[:output_format] || "mp3_44100_128"
      endpoint = "/v1/text-to-dialogue/stream?output_format=#{format}"

      body = { inputs: inputs }
      BODY_OPTION_KEYS.each do |key|
        value = options[key]
        body[key] = value if value
      end

      @client.post_streaming(endpoint, body, &block)
    end

    # Convenience alias
    alias_method :text_to_dialogue_stream, :stream

    private

    attr_reader :client
  end
end
|
49
|
+
|
50
|
+
|
@@ -8,6 +8,7 @@ module ElevenlabsClient
|
|
8
8
|
|
9
9
|
# POST /v1/text-to-speech/{voice_id}/stream
|
10
10
|
# Stream text-to-speech audio in real-time chunks
|
11
|
+
# Documentation: https://elevenlabs.io/docs/api-reference/text-to-speech/stream
|
11
12
|
#
|
12
13
|
# @param voice_id [String] The ID of the voice to use
|
13
14
|
# @param text [String] Text to synthesize
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# frozen_string_literal: true

module ElevenlabsClient
  # Streaming text-to-speech endpoint that also yields character-level
  # timing information alongside the audio chunks.
  class TextToSpeechStreamWithTimestamps
    def initialize(client)
      @client = client
    end

    # POST /v1/text-to-speech/{voice_id}/stream/with-timestamps
    # Streams text-to-speech audio with character-level timing information.
    # Documentation: https://elevenlabs.io/docs/api-reference/text-to-speech/stream-with-timestamps
    #
    # @param voice_id [String] Voice ID to be used
    # @param text [String] The text that will get converted into speech
    # @param options [Hash] Optional TTS parameters
    # @option options [String] :model_id Model identifier (defaults to "eleven_multilingual_v2")
    # @option options [String] :language_code ISO 639-1 language code for text normalization
    # @option options [Hash] :voice_settings Voice settings overriding stored settings
    # @option options [Array<Hash>] :pronunciation_dictionary_locators Pronunciation dictionary locators (max 3)
    # @option options [Integer] :seed Deterministic sampling seed (0-4294967295)
    # @option options [String] :previous_text Text that came before current request
    # @option options [String] :next_text Text that comes after current request
    # @option options [Array<String>] :previous_request_ids Request IDs of previous samples (max 3)
    # @option options [Array<String>] :next_request_ids Request IDs of next samples (max 3)
    # @option options [String] :apply_text_normalization Text normalization mode ("auto", "on", "off")
    # @option options [Boolean] :apply_language_text_normalization Language text normalization
    # @option options [Boolean] :use_pvc_as_ivc Use IVC version instead of PVC (deprecated)
    # @option options [Boolean] :enable_logging Enable logging (defaults to true)
    # @option options [Integer] :optimize_streaming_latency Latency optimizations (0-4, deprecated)
    # @option options [String] :output_format Output format (defaults to "mp3_44100_128")
    # @param block [Proc] Block to handle each streaming chunk containing audio and timing data
    # @return [Faraday::Response] The response object
    def stream(voice_id, text, **options, &block)
      # Query string: enable_logging keeps explicit `false`, others truthy-only
      query = {}
      query[:enable_logging] = options[:enable_logging] unless options[:enable_logging].nil?
      %i[optimize_streaming_latency output_format].each do |key|
        query[key] = options[key] if options[key]
      end

      endpoint = "/v1/text-to-speech/#{voice_id}/stream/with-timestamps"
      unless query.empty?
        endpoint = "#{endpoint}?#{query.map { |k, v| "#{k}=#{v}" }.join("&")}"
      end

      body = { text: text }

      # Truthy-only body fields
      %i[
        model_id language_code voice_settings pronunciation_dictionary_locators
        seed previous_text next_text previous_request_ids next_request_ids
        apply_text_normalization
      ].each do |key|
        value = options[key]
        body[key] = value if value
      end

      # Boolean body fields where an explicit `false` must be forwarded
      %i[apply_language_text_normalization use_pvc_as_ivc].each do |key|
        body[key] = options[key] unless options[key].nil?
      end

      # Streaming variant that JSON-parses each chunk for timestamp data
      @client.post_streaming_with_timestamps(endpoint, body, &block)
    end

    # Backward-compatibility alias
    alias_method :text_to_speech_stream_with_timestamps, :stream

    private

    attr_reader :client
  end
end
|