rb-edge-tts 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +17 -0
- data/LICENSE +195 -0
- data/README.md +257 -0
- data/Rakefile +8 -0
- data/examples/async_audio_gen_with_dynamic_voice_selection.rb +26 -0
- data/examples/async_audio_gen_with_predefined_voice.rb +13 -0
- data/examples/async_audio_gen_with_predefined_voice_async.rb +16 -0
- data/examples/async_audio_streaming_with_subtitles.rb +30 -0
- data/exe/rb-edge-playback +168 -0
- data/exe/rb-edge-tts +192 -0
- data/lib/edge_playback/version.rb +5 -0
- data/lib/rb_edge_tts/communicate.rb +336 -0
- data/lib/rb_edge_tts/constants.rb +38 -0
- data/lib/rb_edge_tts/drm.rb +82 -0
- data/lib/rb_edge_tts/exceptions.rb +10 -0
- data/lib/rb_edge_tts/srt_composer.rb +99 -0
- data/lib/rb_edge_tts/submaker.rb +39 -0
- data/lib/rb_edge_tts/typing.rb +93 -0
- data/lib/rb_edge_tts/util.rb +124 -0
- data/lib/rb_edge_tts/version.rb +6 -0
- data/lib/rb_edge_tts/voices_manager.rb +100 -0
- data/lib/rb_edge_tts.rb +19 -0
- data/lib/rb_edge_tts_simple.rb +24 -0
- data/rb-edge-tts.gemspec +42 -0
- metadata +196 -0
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require_relative '../lib/rb_edge_tts'
|
|
5
|
+
require_relative '../lib/edge_playback/version'
|
|
6
|
+
require 'optparse'
|
|
7
|
+
require 'tempfile'
|
|
8
|
+
require 'open3'
|
|
9
|
+
|
|
10
|
+
module EdgePlayback
|
|
11
|
+
module CLI
|
|
12
|
+
class << self
|
|
13
|
+
# Entry point of the playback wrapper: parses the wrapper's own flags,
# verifies the external tools, synthesizes speech into files through
# rb-edge-tts and plays the result.
#
# Environment variables read here:
#   EDGE_PLAYBACK_DEBUG     - passed to create_temp_files as the debug flag
#   EDGE_PLAYBACK_KEEP_TEMP - passed to cleanup as the keep flag
#   EDGE_PLAYBACK_MP3_FILE  - media path to use instead of a temp file
#   EDGE_PLAYBACK_SRT_FILE  - subtitle path to use instead of a temp file
#
# @param args [Array<String>] command-line arguments; whatever parse_options
#   leaves in the array is forwarded to rb-edge-tts
def run(args)
  options = parse_options(args)
  check_dependencies(options[:use_mpv])

  debug, keep, mp3_file, srt_file = ENV.values_at(
    'EDGE_PLAYBACK_DEBUG',
    'EDGE_PLAYBACK_KEEP_TEMP',
    'EDGE_PLAYBACK_MP3_FILE',
    'EDGE_PLAYBACK_SRT_FILE'
  )

  begin
    mp3_file, srt_file = create_temp_files(options[:use_mpv], mp3_file, srt_file, debug)
    run_edge_tts(mp3_file, srt_file, args)
    play_media(options[:use_mpv], mp3_file, srt_file)
  ensure
    # Runs on success and on failure alike; cleanup itself honours `keep`.
    cleanup(mp3_file, srt_file, keep)
  end
end
|
|
30
|
+
|
|
31
|
+
private
|
|
32
|
+
|
|
33
|
+
# Separates rb-edge-playback's own flags from the arguments meant for
# rb-edge-tts.
#
# `--mpv` forces mpv playback. `--version` and `-h`/`--help` print to stdout
# and exit with status 0. Every other argument is kept in its original order,
# and `args` is rewritten in place so that it holds only those pass-through
# arguments when this method returns.
#
# @param args [Array<String>] raw command-line arguments (mutated)
# @return [Hash] `{ use_mpv: Boolean }`; the default is true except on Windows
def parse_options(args)
  options = { use_mpv: !Gem.win_platform? }
  passthrough = []

  while (token = args.shift)
    if token == '--mpv'
      options[:use_mpv] = true
    elsif token == '--version'
      puts "rb-edge-playback #{EdgePlayback::VERSION}"
      exit 0
    elsif ['-h', '--help'].include?(token)
      puts [
        "Usage: rb-edge-playback [options] [rb-edge-tts options]",
        "",
        "Options:",
        " --mpv Use mpv to play audio",
        " --version Show version",
        " -h, --help Show this help message",
        "",
        "See 'rb-edge-tts --help' for additional arguments"
      ]
      exit 0
    else
      passthrough << token
    end
  end

  # `args` is empty at this point; hand the unconsumed arguments back to the caller.
  args.concat(passthrough)

  options
end
|
|
64
|
+
|
|
65
|
+
# Verifies that the external programs this wrapper shells out to can be found
# on PATH. Prints one line per missing program to stderr and exits with
# status 1 when anything is missing.
#
# The lookup is done in Ruby rather than by spawning `which`, because `which`
# and `/dev/null` are not available on Windows, which this script otherwise
# supports.
#
# @param use_mpv [Boolean] whether mpv is required in addition to rb-edge-tts
# @return [nil] when every dependency is present
def check_dependencies(use_mpv)
  required = ['rb-edge-tts']
  required << 'mpv' if use_mpv

  missing = required.reject { |dep| executable_on_path?(dep) }
  return if missing.empty?

  missing.each { |dep| warn "#{dep} is not installed." }
  warn 'Please install the missing dependencies.'
  exit 1
end

# Returns true when `name` resolves to an executable file in one of the PATH
# directories. On Windows the PATHEXT extensions are tried as well.
def executable_on_path?(name)
  extensions = ['']
  extensions.concat(ENV.fetch('PATHEXT', '.COM;.EXE;.BAT;.CMD').split(';')) if Gem.win_platform?

  ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).any? do |dir|
    # An empty PATH entry conventionally means the current directory.
    dir = '.' if dir.empty?
    extensions.any? do |ext|
      candidate = File.join(dir, "#{name}#{ext}")
      File.file?(candidate) && File.executable?(candidate)
    end
  end
end
|
|
77
|
+
|
|
78
|
+
# Decides which media and subtitle paths to use, creating temporary files for
# whichever one the caller did not supply.
#
# Temporary files are made with Tempfile.create, which returns a plain File
# with no finalizer. A Tempfile.new object would unlink its path as soon as it
# is garbage-collected (or at exit), which could remove the media before or
# during playback and would defeat a request to keep the files.
# Removal is therefore left entirely to the caller.
#
# @param use_mpv [Boolean] a subtitle file is only prepared when mpv is used
# @param mp3_fname [String, nil] caller-supplied media path, if any
# @param srt_fname [String, nil] caller-supplied subtitle path, if any
# @param debug [Object] when truthy, the generated paths are printed to stdout
# @return [Array(String, String|nil)] media path and subtitle path
def create_temp_files(use_mpv, mp3_fname, srt_fname, debug)
  unless mp3_fname
    mp3_fname = reserve_temp_path('.mp3')
    puts "Media file: #{mp3_fname}" if debug
  end

  if use_mpv && !srt_fname
    srt_fname = reserve_temp_path('.srt')
    puts "Subtitle file: #{srt_fname}" if debug
  end

  [mp3_fname, srt_fname]
end

# Creates an empty temporary file with the given extension and returns its
# path. The file is closed but left on disk.
def reserve_temp_path(extension)
  file = Tempfile.create(['rb-edge-playback-', extension])
  file.close
  file.path
end
|
|
95
|
+
|
|
96
|
+
# Runs the rb-edge-tts executable so that it writes audio (and optionally
# subtitles) to the given files.
#
# @param mp3_fname [String] value for --write-media
# @param srt_fname [String, nil] value for --write-subtitles; omitted when nil
# @param tts_args [Array<String>] extra arguments appended verbatim
# @raise [RuntimeError] when the command cannot be run or exits unsuccessfully
# @return [nil]
def run_edge_tts(mp3_fname, srt_fname, tts_args)
  command = ['rb-edge-tts', "--write-media=#{mp3_fname}"]
  command.push("--write-subtitles=#{srt_fname}") if srt_fname
  command += tts_args

  # Argument-list form of system: no shell is involved, so no quoting is needed.
  return if system(*command)

  raise "rb-edge-tts failed with status #{$?.exitstatus}"
end
|
|
104
|
+
|
|
105
|
+
# Plays the generated audio. On Windows without mpv it delegates to
# play_mp3_win32; in every other case it launches mpv, attaching the subtitle
# file when one is given. An mpv failure is reported on stderr only.
#
# @param use_mpv [Boolean] whether mpv was requested / is the default
# @param mp3_fname [String] audio file to play
# @param srt_fname [String, nil] subtitle file handed to mpv via --sub-file
# @return [nil]
def play_media(use_mpv, mp3_fname, srt_fname)
  unless use_mpv || !Gem.win_platform?
    play_mp3_win32(mp3_fname)
    return
  end

  command = ['mpv', '--msg-level=all=error,statusline=status']
  command << "--sub-file=#{srt_fname}" if srt_fname
  command << mp3_fname

  return if system(*command)

  warn "mpv failed with status #{$?.exitstatus}"
end
|
|
118
|
+
|
|
119
|
+
# Plays an MP3 file through the Windows MCI interface (winmm.dll) and waits
# for playback to finish.
#
# Both Win32 entry points used here are the wide-character ("W") variants, so
# every string handed to them is encoded as NUL-terminated UTF-16LE and the
# short-path result is decoded from UTF-16LE. All integer parameters and return
# values are declared as TYPE_INT: Fiddle does not define a TYPE_DWORD
# constant, and the values involved (buffer sizes, MCI error codes) fit.
#
# @param mp3_fname [String] path of the file to play
# @return [nil]
# Exits with status 1 when Fiddle or the Windows libraries cannot be loaded.
def play_mp3_win32(mp3_fname)
  require 'fiddle'

  kernel32 = Fiddle::Handle.new('kernel32')
  winmm = Fiddle::Handle.new('winmm')

  get_short_path_name_w = Fiddle::Function.new(
    kernel32['GetShortPathNameW'],
    [Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP, Fiddle::TYPE_INT],
    Fiddle::TYPE_INT
  )

  mci_send_string_w = Fiddle::Function.new(
    winmm['mciSendStringW'],
    [Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP, Fiddle::TYPE_INT, Fiddle::TYPE_VOIDP],
    Fiddle::TYPE_INT
  )

  to_wide = ->(text) { "#{text}\0".encode('UTF-16LE') }

  # MCI is addressed with the 8.3 short path; sizes below are in UTF-16 code units.
  wide_path = to_wide.call(mp3_fname)
  capacity = 260
  buffer = "\0" * (capacity * 2)
  length = get_short_path_name_w.call(wide_path, buffer, capacity)
  if length > capacity
    # Buffer too small: the return value is the required size, so retry once.
    capacity = length
    buffer = "\0" * (capacity * 2)
    length = get_short_path_name_w.call(wide_path, buffer, capacity)
  end

  short_name =
    if length.positive? && length <= capacity
      buffer.byteslice(0, length * 2).force_encoding('UTF-16LE').encode('UTF-8')
    else
      mp3_fname # lookup failed; fall back to the path as given
    end

  mci_send = lambda do |command|
    code = mci_send_string_w.call(to_wide.call(command), nil, 0, nil)
    warn "Error #{code} in mciSendString #{command}" unless code.zero?
  end

  mci_send.call('Close All')
  mci_send.call("Open \"#{short_name}\" Type MPEGVideo Alias theMP3")
  mci_send.call('Play theMP3 Wait')
  mci_send.call('Close theMP3')
  nil
rescue LoadError => e
  warn "Error loading Windows libraries: #{e.message}"
  exit 1
rescue Fiddle::DLError => e
  # Raised by Fiddle::Handle when a DLL or symbol cannot be resolved.
  warn "Error loading Windows libraries: #{e.message}"
  exit 1
end
|
|
157
|
+
|
|
158
|
+
# Deletes the media and subtitle files unless the caller asked to keep them.
# Paths that are nil or no longer exist are skipped. The return value is not
# meaningful.
#
# @param mp3_fname [String, nil] media file path
# @param srt_fname [String, nil] subtitle file path
# @param keep [Object] any truthy value disables deletion
def cleanup(mp3_fname, srt_fname, keep)
  return if keep

  [mp3_fname, srt_fname].each do |path|
    next unless path

    File.delete(path) if File.exist?(path)
  end
end
|
|
164
|
+
end
|
|
165
|
+
end
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
# Script entry point: run the playback CLI with the process arguments.
EdgePlayback::CLI.run(ARGV)
|
data/exe/rb-edge-tts
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require_relative '../lib/rb_edge_tts'
|
|
5
|
+
require 'optparse'
|
|
6
|
+
require 'fileutils'
|
|
7
|
+
|
|
8
|
+
module RbEdgeTTS
|
|
9
|
+
module CLI
|
|
10
|
+
class << self
|
|
11
|
+
# Entry point of the rb-edge-tts command: either lists the available voices
# and exits with status 0, or resolves the input text and synthesizes it.
#
# @param args [Array<String>] command-line arguments
def run(args)
  options = parse_options(args)

  return run_tts(get_text(options), options) unless options[:list_voices]

  print_voices(options)
  exit 0
end
|
|
22
|
+
|
|
23
|
+
private
|
|
24
|
+
|
|
25
|
+
# Parses the rb-edge-tts command line with OptionParser.
#
# Recognized switches are removed from `args`. `--version` and `-h`/`--help`
# print to stdout and exit 0. Any parse error, including the absence of all of
# --text, --file and --list-voices, is reported on stderr and exits with
# status 1.
#
# @param args [Array<String>] command-line arguments (mutated by parse!)
# @return [Hash] option values; :voice, :rate, :volume and :pitch always have
#   a value, the other keys are present only when given
def parse_options(args)
  options = { voice: RbEdgeTTS::DEFAULT_VOICE, rate: '+0%', volume: '+0%', pitch: '+0Hz' }

  parser = OptionParser.new do |opts|
    opts.banner = "Usage: rb-edge-tts [options]"
    opts.separator ""
    opts.separator "Options:"

    opts.on('-t', '--text TEXT', 'What TTS will say') { |value| options[:text] = value }
    opts.on('-f', '--file FILE', 'Read text from file') { |value| options[:file] = value }
    opts.on('-v', '--voice VOICE', "Voice for TTS (default: #{DEFAULT_VOICE})") { |value| options[:voice] = value }
    opts.on('-l', '--list-voices', 'List available voices and exit') { options[:list_voices] = true }
    opts.on('--rate RATE', 'Set TTS rate (e.g., +20%, -50%)') { |value| options[:rate] = value }
    opts.on('--volume VOLUME', 'Set TTS volume (e.g., +10%, -20%)') { |value| options[:volume] = value }
    opts.on('--pitch PITCH', 'Set TTS pitch (e.g., +5Hz, -10Hz)') { |value| options[:pitch] = value }
    opts.on('--write-media FILE', 'Write media output to file instead of stdout') do |value|
      options[:write_media] = value
    end
    opts.on('--write-subtitles FILE', 'Write subtitle output to file instead of stderr') do |value|
      options[:write_subtitles] = value
    end
    opts.on('--proxy URL', 'Use a proxy for TTS and voice list') { |value| options[:proxy] = value }
    opts.on('--verbose', 'Show debug information') { options[:verbose] = true }

    opts.on('--version', 'Show version') do
      puts "rb-edge-tts #{VERSION}"
      exit 0
    end

    opts.on('-h', '--help', 'Show this help message') do
      puts opts
      exit 0
    end
  end
  parser.parse!(args)

  # One of these three decides what the command does; without any there is nothing to run.
  unless options[:text] || options[:file] || options[:list_voices]
    raise OptionParser::MissingArgument, 'Must specify --text or --file'
  end

  options
rescue OptionParser::ParseError => e
  warn "Error: #{e.message}"
  warn "Use --help for usage information"
  exit 1
end
|
|
101
|
+
|
|
102
|
+
# Resolves the text to synthesize. A --file value takes precedence over
# --text; the special names '-' and '/dev/stdin' read standard input, any
# other name is read as a UTF-8 file.
#
# @param options [Hash] parsed options (:file and/or :text)
# @return [String, nil] the text, or nil when neither option is set
def get_text(options)
  source = options[:file]
  return options[:text] unless source
  return STDIN.read if ['-', '/dev/stdin'].include?(source)

  File.read(source, encoding: 'utf-8')
end
|
|
113
|
+
|
|
114
|
+
# Fetches the voice list, sorts it by short name and prints it to stdout as a
# table with the columns Name, Gender, ContentCategories and
# VoicePersonalities.
#
# @param options [Hash] parsed options; only :proxy is used
def print_voices(options)
  voices = RbEdgeTTS.list_voices(proxy: options[:proxy]).sort_by { |voice| voice.short_name }

  # Loaded on demand: only this code path needs the table renderer.
  require 'terminal-table'

  rows = voices.map do |voice|
    [
      voice.short_name,
      voice.gender,
      voice.voice_tag.content_categories.join(', '),
      voice.voice_tag.voice_personalities.join(', ')
    ]
  end

  puts Terminal::Table.new(
    headings: ['Name', 'Gender', 'ContentCategories', 'VoicePersonalities'],
    rows: rows
  )
end
|
|
133
|
+
|
|
134
|
+
# Synthesizes `text` and writes the result to the configured outputs.
#
# Audio goes to the --write-media file, or to stdout (switched to binary mode)
# when that option is absent or '-'. Subtitles are collected only when
# --write-subtitles is given and are written as SRT to that file, or to stderr
# when the value is '-'.
#
# When stdin and stdout are both terminals and no --write-media was given, the
# user is warned and must press Enter before audio is written to the terminal.
# Ctrl+C anywhere in this method prints a notice and exits with status 1.
#
# Both outputs are opened inside the begin/ensure block so that a failure to
# open the second one still closes the first.
#
# @param text [String] text to speak
# @param options [Hash] parsed command-line options
def run_tts(text, options)
  if STDIN.tty? && STDOUT.tty? && !options[:write_media]
    warn 'Warning: TTS output will be written to the terminal.'
    warn 'Use --write-media to write to a file.'
    warn 'Press Ctrl+C to cancel the operation.'
    warn 'Press Enter to continue.'
    STDIN.gets
  end

  communicate = RbEdgeTTS::Communicate.new(
    text,
    options[:voice],
    rate: options[:rate],
    volume: options[:volume],
    pitch: options[:pitch],
    proxy: options[:proxy],
    verbose: options[:verbose]
  )

  submaker = RbEdgeTTS::SubMaker.new

  audio_file = nil
  sub_file = nil

  begin
    audio_file = open_audio_sink(options[:write_media])
    sub_file = open_subtitle_sink(options[:write_subtitles])

    communicate.stream do |chunk|
      if chunk.type == 'audio'
        audio_file.write(chunk.data)
      elsif %w[WordBoundary SentenceBoundary].include?(chunk.type) && sub_file
        submaker.feed(chunk)
      end
    end

    sub_file.puts(submaker.get_srt) if sub_file
  ensure
    # Never close the process-wide standard streams.
    audio_file.close if audio_file && !audio_file.equal?(STDOUT)
    sub_file.close if sub_file && !sub_file.equal?(STDERR)
  end
rescue Interrupt
  warn "\nOperation canceled."
  exit 1
end

# Opens the audio destination: the named file in binary mode, or stdout
# (set to binary mode) when no path or '-' is given.
def open_audio_sink(path)
  return File.open(path, 'wb') if path && path != '-'

  STDOUT.binmode
  STDOUT
end

# Opens the subtitle destination: nil when subtitles were not requested,
# stderr for '-', otherwise the named file as UTF-8 text.
def open_subtitle_sink(path)
  return nil unless path

  path == '-' ? STDERR : File.open(path, 'w', encoding: 'utf-8')
end
|
|
188
|
+
end
|
|
189
|
+
end
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
# Script entry point: run the rb-edge-tts CLI with the process arguments.
RbEdgeTTS::CLI.run(ARGV)
|
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'eventmachine'
|
|
4
|
+
require 'faye/websocket'
|
|
5
|
+
require 'json'
|
|
6
|
+
require 'openssl'
|
|
7
|
+
require 'time'
|
|
8
|
+
require 'securerandom'
|
|
9
|
+
|
|
10
|
+
require_relative 'typing'
|
|
11
|
+
require_relative 'constants'
|
|
12
|
+
require_relative 'drm'
|
|
13
|
+
require_relative 'util'
|
|
14
|
+
require_relative 'srt_composer'
|
|
15
|
+
require_relative 'submaker'
|
|
16
|
+
require_relative 'voices_manager'
|
|
17
|
+
|
|
18
|
+
module RbEdgeTTS
|
|
19
|
+
class Communicate
|
|
20
|
+
attr_accessor :texts, :proxy, :state, :tts_config
|
|
21
|
+
|
|
22
|
+
# Prepares a synthesis request.
#
# The text is passed through Util.remove_incompatible_characters and
# Util.escape_xml, then split by Util.split_text_by_byte_length with a limit
# of 4096; the resulting pieces are stored in @texts and streamed one at a
# time.
#
# @param text [String] text to synthesize
# @param voice [String] voice name handed to TTSConfig
# @param rate [String] speaking-rate adjustment handed to TTSConfig
# @param volume [String] volume adjustment handed to TTSConfig
# @param pitch [String] pitch adjustment handed to TTSConfig
# @param boundary [String] boundary event type handed to TTSConfig
# @param proxy [String, nil] proxy setting (stored in @proxy)
# @param connect_timeout [Integer] stored in @connect_timeout
# @param receive_timeout [Integer] seconds before a streaming run is stopped
# @param verbose [Boolean] enables diagnostic logging
# @raise [TypeError] when text, proxy or a timeout has the wrong type
def initialize(text,
               voice = RbEdgeTTS::DEFAULT_VOICE,
               rate: '+0%',
               volume: '+0%',
               pitch: '+0Hz',
               boundary: 'SentenceBoundary',
               proxy: nil,
               connect_timeout: 10,
               receive_timeout: 60,
               verbose: false)
  raise TypeError, 'text must be a string' unless text.is_a?(String)

  @tts_config = TTSConfig.new(voice, rate, volume, pitch, boundary)

  cleaned = Util.remove_incompatible_characters(text)
  escaped = Util.escape_xml(cleaned)
  @texts = Util.split_text_by_byte_length(escaped, 4096).to_a

  @proxy = proxy
  raise TypeError, 'proxy must be a string' unless !proxy || proxy.is_a?(String)

  { connect_timeout: connect_timeout, receive_timeout: receive_timeout }.each do |name, value|
    raise TypeError, "#{name} must be an integer" unless value.is_a?(Integer)
  end

  @connect_timeout = connect_timeout
  @receive_timeout = receive_timeout
  @verbose = verbose

  @state = CommunicateState.new(
    partial_text: '',
    offset_compensation: 0,
    last_duration_offset: 0,
    stream_was_called: false
  )
end
|
|
55
|
+
|
|
56
|
+
# Streams the synthesis result, passing every chunk to the given block.
# The prepared text pieces are processed one after another, each through its
# own stream_internal run.
#
# @yield [chunk] each chunk produced by stream_internal
# @raise [RuntimeError] when called a second time on the same instance
def stream(&block)
  raise 'stream can only be called once.' if @state.stream_was_called

  @state.stream_was_called = true

  @texts.each do |segment|
    # stream_internal reads the current piece from the shared state object.
    @state.partial_text = segment
    stream_internal(&block)
  end
end
|
|
66
|
+
|
|
67
|
+
# Returns an Enumerator over the chunks produced by #stream.
#
# #stream runs in a background thread that starts immediately and pushes
# chunks onto a queue; the enumerator pops them until it sees the nil
# end-of-stream marker. The marker is pushed from an ensure clause so that it
# is sent even when #stream raises. Otherwise the consumer would wait on the
# queue forever. The producer's exception is then re-raised in the consumer
# by Thread#join.
#
# @return [Enumerator] yields each chunk in the order it was produced
def stream_sync
  queue = Thread::Queue.new

  producer = Thread.new do
    begin
      stream { |chunk| queue.push(chunk) }
    ensure
      queue.push(nil)
    end
  end

  Enumerator.new do |yielder|
    loop do
      chunk = queue.pop
      break if chunk.nil?

      yielder << chunk
    end
  ensure
    # Waits for the producer and surfaces any exception it raised.
    producer.join
  end
end
|
|
88
|
+
|
|
89
|
+
# Streams the synthesis result into files.
#
# Audio chunks are appended to `audio_fname` (binary). When `metadata_fname`
# is given, each WordBoundary/SentenceBoundary chunk is written to it as one
# JSON line, and the optional block is called with that chunk. Without a
# metadata file, boundary chunks are ignored and the block is never called.
#
# @param audio_fname [String] destination for the audio data
# @param metadata_fname [String, nil] destination for boundary metadata
# @yield [chunk] each boundary chunk that was written to the metadata file
# @raise [TypeError] when a file name is not a String
def save(audio_fname, metadata_fname = nil, &block)
  raise TypeError, 'audio_fname must be a string' unless audio_fname.is_a?(String)
  raise TypeError, 'metadata_fname must be a string' if metadata_fname && !metadata_fname.is_a?(String)

  boundary_types = %w[WordBoundary SentenceBoundary]

  File.open(audio_fname, 'wb') do |audio_file|
    metadata_file = File.open(metadata_fname, 'w', encoding: 'utf-8') if metadata_fname

    stream do |chunk|
      if chunk.type == 'audio'
        audio_file.write(chunk.data)
        next
      end
      next unless metadata_file && boundary_types.include?(chunk.type)

      metadata_file.puts(JSON.generate(chunk.to_h))
      block&.call(chunk)
    end
  ensure
    # The audio file is closed by File.open's block form.
    metadata_file&.close
  end
end
|
|
109
|
+
|
|
110
|
+
# Runs #save in a separate thread and waits for it to finish. An exception
# raised by #save is re-raised here by Thread#join.
#
# @param audio_fname [String] destination for the audio data
# @param metadata_fname [String, nil] destination for boundary metadata
# @raise [TypeError] when a file name is not a String
# @return [Thread] the finished worker thread
def save_sync(audio_fname, metadata_fname = nil, &block)
  raise TypeError, 'audio_fname must be a string' unless audio_fname.is_a?(String)
  raise TypeError, 'metadata_fname must be a string' if metadata_fname && !metadata_fname.is_a?(String)

  worker = Thread.new { save(audio_fname, metadata_fname, &block) }
  worker.join
end
|
|
116
|
+
|
|
117
|
+
private
|
|
118
|
+
|
|
119
|
+
# Performs one synthesis round-trip for the text currently stored in
# @state.partial_text.
#
# Opens a WebSocket inside an EventMachine run loop, sends the command and
# SSML requests once the socket is open, and yields every chunk that
# handle_message extracts from incoming messages. The run loop is stopped when
# the socket closes, on a socket error, or when the timer set to
# @receive_timeout seconds fires.
#
# A StandardError whose message contains '403' is handed to
# DRM.handle_client_response_error and the connection is retried, at most
# `max_403_retries` times; any other error, or a 403 after the retries are
# used up, is re-raised.
#
# Diagnostics go to stderr: stdout may be carrying the audio stream.
#
# NOTE(review): nothing here checks that any audio chunk actually arrived
# before the loop stops; consider raising in that case.
#
# @yield [chunk] each chunk produced by handle_message
def stream_internal
  max_403_retries = 3
  retries = 0
  @ws = nil

  begin
    EventMachine.run do
      url = "#{RbEdgeTTS::WSS_URL}&ConnectionId=#{Util.connect_id}&Sec-MS-GEC=#{DRM.generate_sec_ms_gec}&Sec-MS-GEC-Version=#{RbEdgeTTS::SEC_MS_GEC_VERSION}"

      options = {
        headers: DRM.headers_with_muid(RbEdgeTTS::WSS_HEADERS),
        tls: {
          verify_peer: true,
          ca_file: OpenSSL::X509::DEFAULT_CERT_FILE
        }
      }

      @ws = Faye::WebSocket::Client.new(url, [], options)

      @ws.on :open do |_event|
        log 'WebSocket connection opened'
        send_command_request(@ws)
        send_ssml_request(@ws)
      end

      @ws.on :message do |event|
        handle_message(event.data) { |result| yield result }
      end

      @ws.on :close do |event|
        # 1006 is common after successful transmission
        log "WebSocket connection closed: #{event.code} #{event.reason}" unless event.code == 1006
        EventMachine.stop
      end

      @ws.on :error do |event|
        # Ignore ECONNRESET as it often happens at the end of stream
        log "WebSocket Error: #{event.message}" unless event.message.to_s.include?('ECONNRESET')
        EventMachine.stop
      end

      EventMachine.add_timer(@receive_timeout) do
        warn "Timeout: No response in #{@receive_timeout} seconds"
        EventMachine.stop
      end
    end
  rescue StandardError => e
    raise unless e.message.include?('403')

    # Bound the retries so a persistent 403 cannot loop forever.
    retries += 1
    raise if retries > max_403_retries

    DRM.handle_client_response_error(e)
    retry
  end
end
|
|
177
|
+
|
|
178
|
+
# Builds the command request for the configured boundary type via
# DRM.command_request, logs it, and sends it over the socket.
#
# @param ws [#send] the open WebSocket
def send_command_request(ws)
  log 'Sending command request:'
  payload = DRM.command_request(@tts_config.boundary)
  log payload
  ws.send(payload)
end
|
|
184
|
+
|
|
185
|
+
# Sends the SSML request for the current text piece: four CRLF-terminated
# header lines (request id, content type, timestamp, path), a blank line, and
# the SSML document built by Util.mkssml.
#
# @param ws [#send] the open WebSocket
def send_ssml_request(ws)
  ssml = Util.mkssml(@tts_config, @state.partial_text)

  header_lines = [
    "X-RequestId:#{Util.connect_id}",
    'Content-Type:application/ssml+xml',
    "X-Timestamp:#{Util.date_to_string}Z",
    'Path:ssml'
  ]

  ws.send("#{header_lines.join("\r\n")}\r\n\r\n#{ssml}")
end
|
|
196
|
+
|
|
197
|
+
# Dispatches an incoming WebSocket message: String payloads are treated as
# text frames, everything else (including byte arrays) as binary frames.
#
# @param data [String, Array<Integer>, Object] message payload
# @yield [chunk] forwarded to the specific handler
def handle_message(data, &block)
  return handle_text_message(data, &block) if data.is_a?(String)

  handle_binary_message(data, &block)
end
|
|
207
|
+
|
|
208
|
+
# Handles a text frame of the form "<headers>\r\n\r\n<body>" and routes it by
# its Path header.
#
# - No header/body separator: payloads longer than two characters are retried
#   as a binary frame, anything shorter is dropped.
# - No Path header: retried as a binary frame if the headers contain
#   'Path:audio', otherwise dropped.
# - 'audio.metadata': boundary chunks are yielded and the running duration
#   offset is updated.
# - 'audio': handed to the binary handler.
# - 'turn.end': the offset compensation is updated and the socket is closed.
# - 'response', 'turn.start', 'path', 'SessionEnd': ignored.
#
# @param data [String, nil] frame payload
# @yield [chunk] chunks produced by the delegated handlers
# @raise [UnknownResponse] for any other Path value
def handle_text_message(data, &block)
  return if data.nil? || data.empty?

  separator_at = data.index("\r\n\r\n")
  if separator_at.nil?
    handle_binary_message(data.bytes, &block) if data.length > 2
    return
  end

  headers = data[0...separator_at]
  body = data[(separator_at + 4)..-1]

  path = extract_header_value(headers, 'Path')
  if path.nil?
    handle_binary_message(data.bytes, &block) if headers.include?('Path:audio')
    return
  end

  if path == 'audio.metadata'
    handle_metadata(body, &block)
    update_last_duration_offset(body)
  elsif path == 'audio'
    handle_binary_message(data.bytes, &block)
  elsif path == 'turn.end'
    update_offset_compensation
    @ws&.close
  elsif %w[response turn.start path SessionEnd].include?(path)
    nil
  else
    raise UnknownResponse, "Unknown path received: #{path}"
  end
end
|
|
247
|
+
|
|
248
|
+
# Parses a binary frame and yields its audio payload.
#
# Frame layout as read here: a 2-byte big-endian header length, that many
# bytes of text headers, then the audio body. The bounds check includes the
# 2-byte prefix, so a frame whose declared header would run past the end of
# the data is rejected with UnexpectedResponse instead of failing later on a
# nil slice.
#
# @param data [Array<Integer>, nil] frame bytes
# @yield [TTSChunk] one chunk of type 'audio' when the body is non-empty
# @raise [UnexpectedResponse] when the frame is truncated, its Path is not
#   'audio', or its Content-Type is present and not 'audio/mpeg'
def handle_binary_message(data)
  return if data.nil? || data.length < 2

  header_length = (data[0] << 8) | data[1]
  header_end = 2 + header_length

  if header_end > data.length
    raise UnexpectedResponse, 'The header length is greater than the length of the data.'
  end

  headers = data[2...header_end].pack('C*').force_encoding('utf-8')
  body = data[header_end..-1].pack('C*')

  path = extract_header_value(headers, 'Path')

  raise UnexpectedResponse, "Received binary message, but the path is not audio: #{path}" if path != 'audio'

  content_type = extract_header_value(headers, 'Content-Type')

  if content_type && content_type != 'audio/mpeg'
    raise UnexpectedResponse, "Received binary message, but with an unexpected Content-Type: #{content_type}"
  end

  # Frames without a body carry no audio and are skipped.
  return if body.empty?

  yield TTSChunk.new(type: 'audio', data: body)
end
|
|
277
|
+
|
|
278
|
+
# Parses an audio.metadata body and yields one chunk per WordBoundary or
# SentenceBoundary entry. Each chunk's offset is the entry's Offset plus the
# current @state.offset_compensation; its text is the entry's text.Text value
# passed through Util.unescape_xml.
#
# A body that is not valid JSON is reported on stderr and otherwise ignored.
# stderr is used because stdout may be carrying the audio stream.
#
# @param data [String, nil] JSON metadata body
# @yield [TTSChunk] one chunk per boundary entry
def handle_metadata(data)
  return if data.nil? || data.empty?

  begin
    metadata = JSON.parse(data)
    return unless metadata.is_a?(Hash) && metadata['Metadata'].is_a?(Array)

    metadata['Metadata'].each do |meta_obj|
      meta_type = meta_obj['Type']
      next unless %w[WordBoundary SentenceBoundary].include?(meta_type)

      data_obj = meta_obj['Data']
      current_offset = (data_obj['Offset'] || 0) + @state.offset_compensation
      current_duration = data_obj['Duration'] || 0

      yield TTSChunk.new(
        type: meta_type,
        offset: current_offset,
        duration: current_duration,
        text: Util.unescape_xml(data_obj.dig('text', 'Text') || '')
      )
    end
  rescue JSON::ParserError => e
    warn "JSON parse error: #{e.message}"
  end
end
|
|
304
|
+
|
|
305
|
+
# Looks up a header in a block of "Name:value" lines. The name is matched
# case-insensitively at the start of any line and the value is returned with
# surrounding whitespace removed.
#
# @param headers [String, Object] header text; non-strings yield nil
# @param key [String] header name (matched literally)
# @return [String, nil] the header value, or nil when it is absent
def extract_header_value(headers, key)
  return nil unless headers.is_a?(String)

  pattern = /^#{Regexp.escape(key)}:([^\r\n]*)/i
  headers[pattern, 1]&.strip
end
|
|
311
|
+
|
|
312
|
+
# Records in @state.last_duration_offset the end position (Offset + Duration,
# each defaulting to 0) of the last WordBoundary/SentenceBoundary entry found
# in a metadata body. Bodies that are not valid JSON, or that lack a Metadata
# array, leave the state untouched.
#
# @param data [String] JSON metadata body
def update_last_duration_offset(data)
  parsed = JSON.parse(data)
  return unless parsed.is_a?(Hash) && parsed['Metadata'].is_a?(Array)

  parsed['Metadata'].each do |entry|
    if %w[WordBoundary SentenceBoundary].include?(entry['Type'])
      timing = entry['Data']
      @state.last_duration_offset = (timing['Offset'] || 0) + (timing['Duration'] || 0)
    end
  end
rescue JSON::ParserError
  nil
end
|
|
325
|
+
|
|
326
|
+
# Sets the offset applied to subsequent boundary chunks to the end of the
# last recorded boundary plus a fixed padding of 8_750_000.
# NOTE(review): the padding presumably uses the same time unit as the
# service's Offset/Duration values — confirm.
def update_offset_compensation
  @state.offset_compensation = @state.last_duration_offset + 8_750_000
end
|
|
330
|
+
|
|
331
|
+
# Emits a diagnostic message when verbose mode is enabled.
# Messages go to stderr so that they can never mix with audio data written
# to stdout.
#
# @param message [#to_s] text to log
def log(message)
  warn message if @verbose
end
|
|
334
|
+
end
|
|
335
|
+
end
|
|
336
|
+
|