typecast-ruby 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/typecast/client.rb +5 -0
- data/lib/typecast/composer.rb +249 -0
- data/lib/typecast.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e7decb1a088b596c9055b4251ed588af80d3187d997ff0ba337859f479835c64
|
|
4
|
+
data.tar.gz: 22265da24aa5c1d9a85df19615cfe0e771cb60ebbcadf15f2157cc2ae2da0b9d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1a2024c96530643b47e3760d10a9b07afaf88cb50b7773420970c25e6678f5ac548be1ac15c6776430ab778a2b20370c9c2b03ad3df8938984d35269fe797dd9
|
|
7
|
+
data.tar.gz: dc4cdeda76890dfd668d519dac0f3c415907f6890c696b7b3fa24e3c255f546800e40246b743c822f56c8598e46fd0e6085987009b8540a26cef197971becbc1
|
data/lib/typecast/client.rb
CHANGED
|
@@ -4,6 +4,7 @@ require "securerandom"
|
|
|
4
4
|
require "uri"
|
|
5
5
|
|
|
6
6
|
require "typecast/errors"
|
|
7
|
+
require "typecast/composer"
|
|
7
8
|
require "typecast/models"
|
|
8
9
|
require "typecast/timestamps"
|
|
9
10
|
|
|
@@ -30,6 +31,10 @@ module Typecast
|
|
|
30
31
|
)
|
|
31
32
|
end
|
|
32
33
|
|
|
34
|
+
# Starts a multi-segment speech composition backed by this object.
#
# Returns a new SpeechComposer that receives this object's bound
# `text_to_speech` method as its synthesis callable.
def compose_speech
  synthesize = method(:text_to_speech)
  SpeechComposer.new(synthesize)
end
|
|
37
|
+
|
|
33
38
|
# Browse available API voices at https://typecast.ai/developers/api/voices.
|
|
34
39
|
def generate_to_file(path, text:, voice_id:, model: Models::TTS_MODEL_V30, language: nil, prompt: nil, output: nil, seed: nil)
|
|
35
40
|
request = Models::TTSRequest.new(
|
|
data/lib/typecast/composer.rb
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
require "stringio"
|
|
2
|
+
|
|
3
|
+
require "typecast/models"
|
|
4
|
+
|
|
5
|
+
module Typecast
|
|
6
|
+
# A stretch of silence. `kind` is set to "pause" by the code in this file and
# `seconds` holds the duration as a Float.
PausePart = Struct.new(:kind, :seconds, keyword_init: true)

# A run of literal text found between pause tokens. `kind` is set to "text".
TextPart = Struct.new(:kind, :text, keyword_init: true)

# Inline pause token, e.g. "<|0.3s|>" or "<|2s|>". Capture group 1 is the
# number of seconds (digits with an optional decimal fraction).
PAUSE_TOKEN = /<\|(\d+(?:\.\d+)?)s\|>/.freeze

# Splits text containing inline pause tokens into an ordered list of parts.
#
# text - anything responding to `to_s`; nil is treated as an empty string.
#
# Returns an Array of TextPart and PausePart structs in source order. Empty
# text runs (e.g. between two adjacent tokens) are omitted; an input without
# tokens yields a single TextPart, and an empty input yields [].
def self.parse_pause_markup(text)
  # Coerce once and use the coerced value everywhere; previously only the
  # scan used `to_s`, so nil / non-String input crashed on `text.length`.
  source = text.to_s
  parts = []
  cursor = 0

  source.scan(PAUSE_TOKEN) do
    token = Regexp.last_match
    if token.begin(0) > cursor
      parts << TextPart.new(kind: "text", text: source[cursor...token.begin(0)])
    end
    parts << PausePart.new(kind: "pause", seconds: token[1].to_f)
    cursor = token.end(0)
  end

  # Whatever follows the last token (or the whole string when there is none).
  parts << TextPart.new(kind: "text", text: source[cursor..-1]) if cursor < source.length
  parts
end
|
|
27
|
+
|
|
28
|
+
# Builds a single WAV clip out of several text-to-speech segments and pauses.
#
# Calls are chainable: `defaults(...)`, any mix of `say(...)` and
# `pause(...)`, then `generate`. Every segment is requested as WAV, stripped
# of leading/trailing zero samples, and concatenated with the requested
# silences. Only mono 16-bit PCM WAV responses are accepted.
class SpeechComposer
  # text_to_speech - callable invoked with a Models::TTSRequest for each
  #                  speech segment; its result must respond to `audio_data`.
  def initialize(text_to_speech)
    @text_to_speech = text_to_speech
    @defaults = {}
    @parts = []
  end

  # Merges the given non-nil settings into the composer-wide defaults.
  # `output` may be a Hash or any object responding to `to_h`; output options
  # are merged key by key with previously set defaults.
  #
  # Segments capture the defaults at the time `say` is called, so defaults
  # changed afterwards only affect later segments.
  #
  # Returns self.
  def defaults(voice_id: nil, model: nil, language: nil, prompt: nil, output: nil, seed: nil)
    @defaults = merge_settings(@defaults, settings_hash(
      voice_id: voice_id,
      model: model,
      language: language,
      prompt: prompt,
      output: output,
      seed: seed
    ))
    self
  end

  # Queues a speech segment. Non-nil keyword arguments override the current
  # defaults for this segment only. The text may contain inline pause tokens
  # such as "<|0.5s|>"; they are expanded when `generate` runs.
  #
  # Returns self.
  def say(text, voice_id: nil, model: nil, language: nil, prompt: nil, output: nil, seed: nil)
    overrides = settings_hash(
      voice_id: voice_id,
      model: model,
      language: language,
      prompt: prompt,
      output: output,
      seed: seed
    )
    @parts << { kind: "speech", text: text.to_s, settings: merge_settings(@defaults, overrides) }
    self
  end

  # Inserts silence between speech segments.
  #
  # seconds is a duration in seconds. Use 0.3 for 300 ms, 3 for 3 seconds.
  #
  # Raises ArgumentError unless seconds is a finite real number greater than 0.
  # Returns self.
  def pause(seconds)
    # `real?` keeps Complex values away from `positive?`, which Complex does
    # not implement (it would raise NoMethodError instead of ArgumentError).
    valid = seconds.is_a?(Numeric) && seconds.real? && seconds.finite? && seconds.positive?
    raise ArgumentError, "pause seconds must be greater than 0" unless valid

    @parts << PausePart.new(kind: "pause", seconds: seconds.to_f)
    self
  end

  # Synthesizes every queued segment and stitches the audio together.
  #
  # Returns a Models::TTSResponse whose audio_data is a WAV file, whose
  # duration is the sample count divided by the sample rate, and whose format
  # is Models::AUDIO_WAV.
  #
  # Raises ArgumentError when there is no speech segment, when the default
  # output format is neither WAV nor MP3, when MP3 is requested (encoding it
  # is not available here), when the plan starts with a pause, or when a
  # returned segment is not WAV data in one consistent mono 16-bit PCM format.
  def generate
    plan = build_plan
    unless plan.any? { |part| speech_part?(part) }
      raise ArgumentError, "at least one speech segment is required"
    end

    output_format = @defaults.dig(:output, :audio_format) || Models::AUDIO_WAV
    unless [Models::AUDIO_WAV, Models::AUDIO_MP3].include?(output_format)
      raise ArgumentError, "unsupported composed speech output format: #{output_format}"
    end

    # A pause needs the sample rate of a preceding segment to be sized.
    raise ArgumentError, "pause cannot be the first composed part" if plan.first.is_a?(PausePart)

    # Fail before any synthesis call: MP3 output can never succeed here, so
    # issuing the requests first would only waste them.
    if output_format == Models::AUDIO_MP3
      raise ArgumentError, "ffmpeg is required to encode composed speech as mp3"
    end

    wav_spec = nil
    output_samples = []
    plan.each do |part|
      if part.is_a?(PausePart)
        output_samples.concat(Array.new(seconds_to_samples(part.seconds, wav_spec[:sample_rate]), 0))
        next
      end

      response = @text_to_speech.call(request_from_settings(part[:text], part[:settings]))
      wav = parse_wav(response.audio_data)
      if wav_spec && wav[:spec] != wav_spec
        raise ArgumentError, "all composed WAV segments must use the same PCM format"
      end

      wav_spec = wav[:spec]
      output_samples.concat(trim_silence(wav[:samples]))
    end

    Models::TTSResponse.new(
      audio_data: encode_wav(output_samples, wav_spec),
      duration: output_samples.length.to_f / wav_spec[:sample_rate],
      format: Models::AUDIO_WAV
    )
  end

  private

  # True for the Hash entries that represent text to synthesize.
  def speech_part?(part)
    part.is_a?(Hash) && part[:kind] == "speech"
  end

  # Expands the queued parts into the final ordered plan: explicit pauses are
  # kept, and each `say` text is split on inline pause tokens into speech
  # Hashes and PausePart structs. Whitespace-only text runs are dropped.
  #
  # Raises ArgumentError when a speech run has no voice_id or no model.
  def build_plan
    plan = []
    @parts.each do |part|
      if part.is_a?(PausePart)
        plan << part
        next
      end

      Typecast.parse_pause_markup(part[:text]).each do |parsed|
        if parsed.is_a?(PausePart)
          plan << parsed
          next
        end
        next if parsed.text.strip.empty?

        raise ArgumentError, "voice_id is required for composed speech segments" if part[:settings][:voice_id].to_s.empty?
        raise ArgumentError, "model is required for composed speech segments" if part[:settings][:model].to_s.empty?

        plan << { kind: "speech", text: parsed.text, settings: part[:settings] }
      end
    end
    plan
  end

  # Builds a settings Hash containing only the options that were given
  # (nil values are removed). `output` is normalized through output_hash.
  def settings_hash(voice_id:, model:, language:, prompt:, output:, seed:)
    {
      voice_id: voice_id,
      model: model,
      language: language,
      prompt: prompt,
      output: output_hash(output),
      seed: seed
    }.reject { |_key, value| value.nil? }
  end

  # Returns a new Hash with override applied on top of base. The :output
  # entry is merged key by key instead of being replaced wholesale.
  def merge_settings(base, override)
    merged = base.merge(override)
    merged[:output] = merge_output(base[:output], override[:output])
    merged.reject { |_key, value| value.nil? }
  end

  # Merges two optional output Hashes; returns nil when both are nil.
  def merge_output(base, override)
    return nil if base.nil? && override.nil?

    (base || {}).merge(override || {})
  end

  # Converts an output option to a Hash via `to_h` when it responds to it;
  # nil stays nil and anything else is returned unchanged.
  def output_hash(output)
    return nil if output.nil?
    return output.to_h if output.respond_to?(:to_h)

    output
  end

  # Builds the request for one speech run. The audio format is always forced
  # to WAV, because segments are stitched together at the PCM sample level.
  def request_from_settings(text, settings)
    output = merge_output(settings[:output], audio_format: Models::AUDIO_WAV)
    Models::TTSRequest.new(
      voice_id: settings[:voice_id],
      text: text,
      model: settings[:model],
      language: settings[:language],
      prompt: settings[:prompt],
      output: Models::Output.new(**output),
      seed: settings[:seed]
    )
  end

  # Parses a RIFF/WAVE byte string.
  #
  # Returns { spec: { sample_rate:, channels:, bits_per_sample: },
  #           samples: [Integer, ...] } with signed 16-bit samples.
  #
  # Raises ArgumentError for anything that is not a well-formed mono 16-bit
  # PCM WAV with both a "fmt " and a "data" chunk.
  def parse_wav(data)
    # `to_s` turns missing audio (nil) into the regular "unsupported" error.
    io = StringIO.new(data.to_s)
    raise ArgumentError, "unsupported WAV data" unless io.read(4) == "RIFF"

    io.read(4) # overall RIFF size; not needed
    raise ArgumentError, "unsupported WAV data" unless io.read(4) == "WAVE"

    spec = nil
    samples = nil
    until io.eof?
      chunk_id = io.read(4)
      break if chunk_id.nil? || chunk_id.bytesize < 4

      chunk_size_bytes = io.read(4)
      raise ArgumentError, "unsupported WAV data" if chunk_size_bytes.nil? || chunk_size_bytes.bytesize < 4

      chunk_size = chunk_size_bytes.unpack1("V")
      chunk_data = io.read(chunk_size)
      io.read(1) if chunk_size.odd? # chunks are padded to an even length
      raise ArgumentError, "unsupported WAV data" if chunk_data.nil? || chunk_data.bytesize < chunk_size

      case chunk_id
      when "fmt "
        audio_format, channels, sample_rate, _byte_rate, _block_align, bits_per_sample = chunk_data.unpack("vvVVvv")
        if audio_format != 1 || channels != 1 || bits_per_sample != 16
          raise ArgumentError, "only mono 16-bit PCM WAV is supported for composed speech"
        end
        # A zero rate would make pause sizing meaningless and the duration NaN.
        raise ArgumentError, "unsupported WAV data" if sample_rate.zero?

        spec = { sample_rate: sample_rate, channels: channels, bits_per_sample: bits_per_sample }
      when "data"
        samples = chunk_data.unpack("s<*")
      end
    end

    raise ArgumentError, "unsupported WAV data" if spec.nil? || samples.nil?

    { spec: spec, samples: samples }
  end

  # Serializes signed 16-bit samples into a PCM WAV byte string (44-byte
  # header followed by the little-endian sample data). The result is a
  # binary-encoded String.
  def encode_wav(samples, spec)
    payload = samples.pack("s<*")
    block_align = spec[:channels] * (spec[:bits_per_sample] / 8)
    header = [
      "RIFF", 36 + payload.bytesize, "WAVE",
      "fmt ", 16, 1, spec[:channels], spec[:sample_rate],
      spec[:sample_rate] * block_align, block_align, spec[:bits_per_sample],
      "data", payload.bytesize
    ].pack("a4Va4a4VvvVVvva4V")
    header << payload
  end

  # Returns the samples without leading and trailing zero-valued samples;
  # [] when every sample is zero or the input is empty.
  def trim_silence(samples)
    first_audible = samples.index { |sample| !sample.zero? }
    return [] if first_audible.nil?

    last_audible = samples.rindex { |sample| !sample.zero? }
    samples[first_audible..last_audible]
  end

  # Number of samples covering the given duration, rounded to the nearest
  # whole sample.
  def seconds_to_samples(seconds, sample_rate)
    (seconds * sample_rate).round
  end
end
|
|
249
|
+
end
|
data/lib/typecast.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: typecast-ruby
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.2
|
|
4
|
+
version: 0.1.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Neosapience
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-06-
|
|
11
|
+
date: 2026-06-18 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: minitest
|
|
@@ -51,6 +51,7 @@ files:
|
|
|
51
51
|
- THIRD-PARTY-LICENSES.md
|
|
52
52
|
- lib/typecast.rb
|
|
53
53
|
- lib/typecast/client.rb
|
|
54
|
+
- lib/typecast/composer.rb
|
|
54
55
|
- lib/typecast/errors.rb
|
|
55
56
|
- lib/typecast/models.rb
|
|
56
57
|
- lib/typecast/timestamps.rb
|