typecast-ruby 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1694bfd6415ffc2b46d014e34add996248c255c68ce5927c76e213bcf3fa86da
4
- data.tar.gz: afeef1970ae7e132bcd1438b942a25c834b5fe397f789e460c9dbf117bc6ab3f
3
+ metadata.gz: e7decb1a088b596c9055b4251ed588af80d3187d997ff0ba337859f479835c64
4
+ data.tar.gz: 22265da24aa5c1d9a85df19615cfe0e771cb60ebbcadf15f2157cc2ae2da0b9d
5
5
  SHA512:
6
- metadata.gz: f45bc9e47497cb6b7cd8d01a9d3ceaf767568b515e1ab6254ebf0d135d7c82cb4a743bb523dcf1bb227faa87963a0ab8a732ea36024e4deec70c53c307ae2c6f
7
- data.tar.gz: '0388fe2ac016617905bd89af4dfff027210705866a47012d0c544a822d3dd5dca43fb6dd6fe70cb45c05a90130a4f4f440b453e647423c162c8ec2a08eb8247f'
6
+ metadata.gz: 1a2024c96530643b47e3760d10a9b07afaf88cb50b7773420970c25e6678f5ac548be1ac15c6776430ab778a2b20370c9c2b03ad3df8938984d35269fe797dd9
7
+ data.tar.gz: dc4cdeda76890dfd668d519dac0f3c415907f6890c696b7b3fa24e3c255f546800e40246b743c822f56c8598e46fd0e6085987009b8540a26cef197971becbc1
@@ -4,6 +4,7 @@ require "securerandom"
4
4
  require "uri"
5
5
 
6
6
  require "typecast/errors"
7
+ require "typecast/composer"
7
8
  require "typecast/models"
8
9
  require "typecast/timestamps"
9
10
 
@@ -30,6 +31,26 @@ module Typecast
30
31
  )
31
32
  end
32
33
 
34
# Returns a new SpeechComposer whose segments are synthesized by calling
# this object's #text_to_speech method.
def compose_speech
  synthesizer = method(:text_to_speech)
  SpeechComposer.new(synthesizer)
end
37
+
38
# Synthesizes +text+ with #text_to_speech and writes the returned audio bytes
# to +path+ in binary mode.
#
# When +output+ is not given, it is derived from the extension of +path+ via
# #inferred_output. Returns the response object produced by #text_to_speech.
#
# Browse available API voices at https://typecast.ai/developers/api/voices.
def generate_to_file(path, text:, voice_id:, model: Models::TTS_MODEL_V30, language: nil, prompt: nil, output: nil, seed: nil)
  output ||= inferred_output(path)
  request = Models::TTSRequest.new(
    voice_id: voice_id,
    text: text,
    model: model,
    language: language,
    prompt: prompt,
    output: output,
    seed: seed
  )
  text_to_speech(request).tap do |response|
    File.binwrite(path, response.audio_data)
  end
end
53
+
33
54
  def text_to_speech_stream(request)
34
55
  response = request_json(:post, "/v1/text-to-speech/stream", request.to_h)
35
56
  return enum_for(:text_to_speech_stream, request) unless block_given?
@@ -81,6 +102,13 @@ module Typecast
81
102
 
82
103
  private
83
104
 
105
# Picks an output configuration from the file extension of +path+
# (case-insensitive). Returns a Models::Output for ".mp3" or ".wav" and nil
# for any other extension.
def inferred_output(path)
  audio_format =
    case File.extname(path.to_s).downcase
    when ".mp3" then Models::AUDIO_MP3
    when ".wav" then Models::AUDIO_WAV
    end
  return nil if audio_format.nil?

  Models::Output.new(audio_format: audio_format)
end
111
+
84
112
  def request_json(method, path, body = nil, query = nil)
85
113
  headers = auth_headers.merge("Content-Type" => "application/json")
86
114
  request_raw(method, path, body.nil? ? nil : JSON.generate(body), headers, query)
@@ -0,0 +1,249 @@
1
+ require "stringio"
2
+
3
+ require "typecast/models"
4
+
5
+ module Typecast
6
# A silent gap; +kind+ is "pause" and +seconds+ is the gap length in seconds.
PausePart = Struct.new(:kind, :seconds, keyword_init: true)
# A run of literal text; +kind+ is "text" and +text+ is the string content.
TextPart = Struct.new(:kind, :text, keyword_init: true)

# Inline pause token such as "<|0.3s|>" or "<|2s|>". Capture group 1 is the
# decimal number of seconds.
PAUSE_TOKEN = /<\|(\d+(?:\.\d+)?)s\|>/.freeze

# Splits +text+ into an ordered list of TextPart and PausePart values.
#
# text - any object; it is converted with #to_s once, so nil yields an empty
#        list and non-String values are parsed by their string form.
#
# Text between pause tokens is kept verbatim (no stripping); empty stretches
# between adjacent tokens produce no TextPart.
#
# Returns an Array of TextPart / PausePart in source order.
def self.parse_pause_markup(text)
  # Coerce once and use the coerced string everywhere: slicing or measuring
  # the raw argument would fail for nil and other non-String inputs.
  source = text.to_s
  parts = []
  cursor = 0
  source.scan(PAUSE_TOKEN) do |captures|
    token = Regexp.last_match
    if token.begin(0) > cursor
      parts << TextPart.new(kind: "text", text: source[cursor...token.begin(0)])
    end
    parts << PausePart.new(kind: "pause", seconds: captures.first.to_f)
    cursor = token.end(0)
  end
  parts << TextPart.new(kind: "text", text: source[cursor..-1]) if cursor < source.length
  parts
end
27
+
28
# Fluent builder that joins several text-to-speech segments and silent pauses
# into a single mono 16-bit PCM WAV clip.
#
# Every speech segment is synthesized through the callable given to
# #initialize (always requested as WAV), has its leading and trailing zero
# samples removed, and is appended to one sample buffer. Pauses append zero
# samples at the sample rate of the segments synthesized so far.
class SpeechComposer
  # text_to_speech - object responding to #call. It receives one
  #   Models::TTSRequest per speech segment and must return an object that
  #   responds to #audio_data with WAV bytes.
  def initialize(text_to_speech)
    @text_to_speech = text_to_speech
    @defaults = {}
    @parts = []
  end

  # Merges the given settings into the defaults used by later #say calls.
  # Settings left as nil keep their previous value. Returns self.
  def defaults(voice_id: nil, model: nil, language: nil, prompt: nil, output: nil, seed: nil)
    @defaults = merge_settings(@defaults, settings_hash(
      voice_id: voice_id,
      model: model,
      language: language,
      prompt: prompt,
      output: output,
      seed: seed
    ))
    self
  end

  # Queues +text+ (converted with #to_s) as a speech segment. Settings passed
  # here override the defaults in effect at the time of the call, for this
  # segment only. Pause tokens inside the text are expanded when #generate
  # runs. Returns self.
  def say(text, voice_id: nil, model: nil, language: nil, prompt: nil, output: nil, seed: nil)
    @parts << {
      kind: "speech",
      text: text.to_s,
      settings: merge_settings(@defaults, settings_hash(
        voice_id: voice_id,
        model: model,
        language: language,
        prompt: prompt,
        output: output,
        seed: seed
      ))
    }
    self
  end

  # Inserts silence between speech segments.
  #
  # seconds is a duration in seconds. Use 0.3 for 300 ms, 3 for 3 seconds.
  #
  # Raises ArgumentError unless seconds is a finite number greater than 0.
  # Returns self.
  def pause(seconds)
    unless seconds.is_a?(Numeric) && seconds.finite? && seconds.positive?
      raise ArgumentError, "pause seconds must be greater than 0"
    end

    @parts << PausePart.new(kind: "pause", seconds: seconds.to_f)
    self
  end

  # Synthesizes every queued segment and returns one Models::TTSResponse
  # holding the combined WAV bytes, its duration in seconds and the WAV format.
  #
  # Raises ArgumentError when no speech segment is queued, when a segment
  # lacks voice_id or model, when the default output format is neither WAV
  # nor MP3, when MP3 is requested (no encoder is available here), when a
  # pause precedes the first speech segment, or when the synthesized audio is
  # not mono 16-bit PCM WAV with one consistent format.
  def generate
    plan = build_plan
    unless plan.any? { |part| part.is_a?(Hash) && part[:kind] == "speech" }
      raise ArgumentError, "at least one speech segment is required"
    end

    output_format = @defaults.dig(:output, :audio_format) || Models::AUDIO_WAV
    unless [Models::AUDIO_WAV, Models::AUDIO_MP3].include?(output_format)
      raise ArgumentError, "unsupported composed speech output format: #{output_format}"
    end
    # MP3 can never be produced by this method, so reject it before calling
    # the synthesizer instead of after every segment has been generated.
    raise ArgumentError, "ffmpeg is required to encode composed speech as mp3" if output_format == Models::AUDIO_MP3

    wav_spec = nil
    output_samples = []
    plan.each do |part|
      if part.is_a?(PausePart)
        # The sample rate is only known once a segment has been synthesized.
        raise ArgumentError, "pause cannot be the first composed part" if wav_spec.nil?

        output_samples.concat(Array.new(seconds_to_samples(part.seconds, wav_spec[:sample_rate]), 0))
        next
      end

      response = @text_to_speech.call(request_from_settings(part[:text], part[:settings]))
      wav = parse_wav(response.audio_data)
      if wav_spec && wav[:spec] != wav_spec
        raise ArgumentError, "all composed WAV segments must use the same PCM format"
      end

      wav_spec = wav[:spec]
      output_samples.concat(trim_silence(wav[:samples]))
    end

    Models::TTSResponse.new(
      audio_data: encode_wav(output_samples, wav_spec),
      duration: output_samples.length.to_f / wav_spec[:sample_rate],
      format: Models::AUDIO_WAV
    )
  end

  private

  # Expands the queued parts into a flat list of PausePart values and speech
  # hashes, splitting each queued text on inline pause tokens and dropping
  # whitespace-only text. Raises ArgumentError when a speech segment has no
  # voice_id or model.
  def build_plan
    plan = []
    @parts.each do |part|
      if part.is_a?(PausePart)
        plan << part
        next
      end

      Typecast.parse_pause_markup(part[:text]).each do |parsed|
        if parsed.is_a?(PausePart)
          plan << parsed
          next
        end
        next if parsed.text.strip.empty?

        raise ArgumentError, "voice_id is required for composed speech segments" if part[:settings][:voice_id].to_s.empty?
        raise ArgumentError, "model is required for composed speech segments" if part[:settings][:model].to_s.empty?

        plan << { kind: "speech", text: parsed.text, settings: part[:settings] }
      end
    end
    plan
  end

  # Builds a settings hash from the keyword values, omitting nil entries.
  def settings_hash(voice_id:, model:, language:, prompt:, output:, seed:)
    {
      voice_id: voice_id,
      model: model,
      language: language,
      prompt: prompt,
      output: output_hash(output),
      seed: seed
    }.reject { |_key, value| value.nil? }
  end

  # Returns a new hash with +override+ applied on top of +base+; the nested
  # :output hashes are merged key by key instead of being replaced.
  def merge_settings(base, override)
    merged = base.merge(override)
    merged[:output] = merge_output(base[:output], override[:output])
    merged.reject { |_key, value| value.nil? }
  end

  # Merges two optional output hashes; nil when both are nil.
  def merge_output(base, override)
    return nil if base.nil? && override.nil?

    (base || {}).merge(override || {})
  end

  # Normalizes an output setting to a hash when it responds to #to_h.
  def output_hash(output)
    return nil if output.nil?
    return output.to_h if output.respond_to?(:to_h)

    output
  end

  # Builds the request for one segment. The audio format is forced to WAV so
  # the response can be parsed and concatenated sample by sample.
  def request_from_settings(text, settings)
    output = merge_output(settings[:output], { audio_format: Models::AUDIO_WAV })
    Models::TTSRequest.new(
      voice_id: settings[:voice_id],
      text: text,
      model: settings[:model],
      language: settings[:language],
      prompt: settings[:prompt],
      output: Models::Output.new(**output),
      seed: settings[:seed]
    )
  end

  # Parses RIFF/WAVE bytes into { spec:, samples: } where spec holds
  # sample_rate, channels and bits_per_sample and samples is an Array of
  # signed 16-bit integers. Raises ArgumentError for anything that is not a
  # complete mono 16-bit PCM WAV.
  def parse_wav(data)
    # to_s turns missing audio (nil) into an empty buffer so it is reported
    # with the same ArgumentError as any other malformed input.
    io = StringIO.new(data.to_s)
    raise ArgumentError, "unsupported WAV data" unless io.read(4) == "RIFF"

    io.read(4) # overall RIFF size; not needed
    raise ArgumentError, "unsupported WAV data" unless io.read(4) == "WAVE"

    spec = nil
    samples = nil
    until io.eof?
      chunk_id = io.read(4)
      break if chunk_id.nil? || chunk_id.bytesize < 4

      chunk_size_bytes = io.read(4)
      raise ArgumentError, "unsupported WAV data" if chunk_size_bytes.nil? || chunk_size_bytes.bytesize < 4

      chunk_size = chunk_size_bytes.unpack1("V")
      chunk_data = io.read(chunk_size)
      io.read(1) if chunk_size.odd? # chunks are padded to an even length
      raise ArgumentError, "unsupported WAV data" if chunk_data.nil? || chunk_data.bytesize < chunk_size

      case chunk_id
      when "fmt "
        audio_format, channels, sample_rate, _byte_rate, _block_align, bits_per_sample = chunk_data.unpack("vvVVvv")
        if audio_format != 1 || channels != 1 || bits_per_sample != 16
          raise ArgumentError, "only mono 16-bit PCM WAV is supported for composed speech"
        end
        spec = { sample_rate: sample_rate, channels: channels, bits_per_sample: bits_per_sample }
      when "data"
        samples = chunk_data.unpack("s<*")
      end
    end

    raise ArgumentError, "unsupported WAV data" if spec.nil? || samples.nil?

    { spec: spec, samples: samples }
  end

  # Serializes 16-bit samples into a WAV byte string with a 44-byte header
  # described by +spec+.
  def encode_wav(samples, spec)
    payload = samples.pack("s<*")
    # Derive the frame size from the spec so the header fields stay
    # consistent with bits_per_sample.
    block_align = spec[:channels] * (spec[:bits_per_sample] / 8)
    [
      "RIFF",
      [36 + payload.bytesize].pack("V"),
      "WAVE",
      "fmt ",
      [16, 1, spec[:channels], spec[:sample_rate], spec[:sample_rate] * block_align, block_align, spec[:bits_per_sample]].pack("VvvVVvv"),
      "data",
      [payload.bytesize].pack("V"),
      payload
    ].join
  end

  # Returns the samples without leading and trailing zero-valued samples;
  # an all-zero or empty input yields an empty Array.
  def trim_silence(samples)
    first = samples.index(&:nonzero?)
    return [] if first.nil?

    samples[first..samples.rindex(&:nonzero?)]
  end

  # Converts a duration in seconds to a whole number of samples.
  def seconds_to_samples(seconds, sample_rate)
    (seconds * sample_rate).round
  end
end
249
+ end
@@ -90,6 +90,7 @@ module Typecast
90
90
  class TTSRequest
91
91
  attr_reader :voice_id, :text, :model, :language, :prompt, :output, :seed
92
92
 
93
+ # Browse available API voices at https://typecast.ai/developers/api/voices.
93
94
  def initialize(voice_id:, text:, model:, language: nil, prompt: nil, output: nil, seed: nil)
94
95
  @voice_id = voice_id
95
96
  @text = text
data/lib/typecast.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require "typecast/client"
2
+ require "typecast/composer"
2
3
  require "typecast/errors"
3
4
  require "typecast/models"
4
5
  require "typecast/timestamps"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: typecast-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Neosapience
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-06-15 00:00:00.000000000 Z
11
+ date: 2026-06-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: minitest
@@ -51,6 +51,7 @@ files:
51
51
  - THIRD-PARTY-LICENSES.md
52
52
  - lib/typecast.rb
53
53
  - lib/typecast/client.rb
54
+ - lib/typecast/composer.rb
54
55
  - lib/typecast/errors.rb
55
56
  - lib/typecast/models.rb
56
57
  - lib/typecast/timestamps.rb