RubyGems - typecast-ruby - Version diff - 0.1.1 → 0.1.3 - Mend

typecast-ruby 0.1.1 → 0.1.3

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1694bfd6415ffc2b46d014e34add996248c255c68ce5927c76e213bcf3fa86da
-  data.tar.gz: afeef1970ae7e132bcd1438b942a25c834b5fe397f789e460c9dbf117bc6ab3f
+  metadata.gz: e7decb1a088b596c9055b4251ed588af80d3187d997ff0ba337859f479835c64
+  data.tar.gz: 22265da24aa5c1d9a85df19615cfe0e771cb60ebbcadf15f2157cc2ae2da0b9d
 SHA512:
-  metadata.gz: f45bc9e47497cb6b7cd8d01a9d3ceaf767568b515e1ab6254ebf0d135d7c82cb4a743bb523dcf1bb227faa87963a0ab8a732ea36024e4deec70c53c307ae2c6f
-  data.tar.gz: '0388fe2ac016617905bd89af4dfff027210705866a47012d0c544a822d3dd5dca43fb6dd6fe70cb45c05a90130a4f4f440b453e647423c162c8ec2a08eb8247f'
+  metadata.gz: 1a2024c96530643b47e3760d10a9b07afaf88cb50b7773420970c25e6678f5ac548be1ac15c6776430ab778a2b20370c9c2b03ad3df8938984d35269fe797dd9
+  data.tar.gz: dc4cdeda76890dfd668d519dac0f3c415907f6890c696b7b3fa24e3c255f546800e40246b743c822f56c8598e46fd0e6085987009b8540a26cef197971becbc1

data/lib/typecast/client.rb CHANGED Viewed

@@ -4,6 +4,7 @@ require "securerandom"
 require "uri"
 require "typecast/errors"
+require "typecast/composer"
 require "typecast/models"
 require "typecast/timestamps"
@@ -30,6 +31,26 @@ module Typecast
       )
     end
+    def compose_speech
+      SpeechComposer.new(method(:text_to_speech))
+    end
+    # Browse available API voices at https://typecast.ai/developers/api/voices.
+    def generate_to_file(path, text:, voice_id:, model: Models::TTS_MODEL_V30, language: nil, prompt: nil, output: nil, seed: nil)
+      request = Models::TTSRequest.new(
+        voice_id: voice_id,
+        text: text,
+        model: model,
+        language: language,
+        prompt: prompt,
+        output: output || inferred_output(path),
+        seed: seed
+      )
+      response = text_to_speech(request)
+      File.binwrite(path, response.audio_data)
+      response
+    end
     def text_to_speech_stream(request)
       response = request_json(:post, "/v1/text-to-speech/stream", request.to_h)
       return enum_for(:text_to_speech_stream, request) unless block_given?
@@ -81,6 +102,13 @@ module Typecast
     private
+    def inferred_output(path)
+      case File.extname(path.to_s).downcase
+      when ".mp3" then Models::Output.new(audio_format: Models::AUDIO_MP3)
+      when ".wav" then Models::Output.new(audio_format: Models::AUDIO_WAV)
+      end
+    end
     def request_json(method, path, body = nil, query = nil)
       headers = auth_headers.merge("Content-Type" => "application/json")
       request_raw(method, path, body.nil? ? nil : JSON.generate(body), headers, query)

data/lib/typecast/composer.rb ADDED Viewed

@@ -0,0 +1,249 @@
+require "stringio"
+require "typecast/models"
+module Typecast
+  PausePart = Struct.new(:kind, :seconds, keyword_init: true)
+  TextPart = Struct.new(:kind, :text, keyword_init: true)
+  PAUSE_TOKEN = /<\|(\d+(?:\.\d+)?)s\|>/.freeze
+  def self.parse_pause_markup(text)
+    parts = []
+    last_index = 0
+    text.to_s.scan(PAUSE_TOKEN) do |match|
+      match_data = Regexp.last_match
+      if match_data.begin(0) > last_index
+        parts << TextPart.new(kind: "text", text: text[last_index...match_data.begin(0)])
+      end
+      parts << PausePart.new(kind: "pause", seconds: match[0].to_f)
+      last_index = match_data.end(0)
+    end
+    if last_index < text.length
+      parts << TextPart.new(kind: "text", text: text[last_index..-1])
+    end
+    parts
+  end
+  class SpeechComposer
+    def initialize(text_to_speech)
+      @text_to_speech = text_to_speech
+      @defaults = {}
+      @parts = []
+    end
+    def defaults(voice_id: nil, model: nil, language: nil, prompt: nil, output: nil, seed: nil)
+      @defaults = merge_settings(@defaults, settings_hash(
+        voice_id: voice_id,
+        model: model,
+        language: language,
+        prompt: prompt,
+        output: output,
+        seed: seed
+      ))
+      self
+    end
+    def say(text, voice_id: nil, model: nil, language: nil, prompt: nil, output: nil, seed: nil)
+      @parts << {
+        kind: "speech",
+        text: text.to_s,
+        settings: merge_settings(@defaults, settings_hash(
+          voice_id: voice_id,
+          model: model,
+          language: language,
+          prompt: prompt,
+          output: output,
+          seed: seed
+        ))
+      }
+      self
+    end
+    # Inserts silence between speech segments.
+    #
+    # seconds is a duration in seconds. Use 0.3 for 300 ms, 3 for 3 seconds.
+    def pause(seconds)
+      unless seconds.is_a?(Numeric) && seconds.finite? && seconds.positive?
+        raise ArgumentError, "pause seconds must be greater than 0"
+      end
+      @parts << PausePart.new(kind: "pause", seconds: seconds.to_f)
+      self
+    end
+    def generate
+      plan = build_plan
+      unless plan.any? { |part| part.is_a?(Hash) && part[:kind] == "speech" }
+        raise ArgumentError, "at least one speech segment is required"
+      end
+      output_format = @defaults.dig(:output, :audio_format) || Models::AUDIO_WAV
+      unless [Models::AUDIO_WAV, Models::AUDIO_MP3].include?(output_format)
+        raise ArgumentError, "unsupported composed speech output format: #{output_format}"
+      end
+      wav_spec = nil
+      output_samples = []
+      plan.each do |part|
+        if part.is_a?(PausePart)
+          raise ArgumentError, "pause cannot be the first composed part" if wav_spec.nil?
+          output_samples.concat(Array.new(seconds_to_samples(part.seconds, wav_spec[:sample_rate]), 0))
+          next
+        end
+        response = @text_to_speech.call(request_from_settings(part[:text], part[:settings]))
+        wav = parse_wav(response.audio_data)
+        if wav_spec && wav[:spec] != wav_spec
+          raise ArgumentError, "all composed WAV segments must use the same PCM format"
+        end
+        wav_spec = wav[:spec]
+        output_samples.concat(trim_silence(wav[:samples]))
+      end
+      wav_data = encode_wav(output_samples, wav_spec)
+      raise ArgumentError, "ffmpeg is required to encode composed speech as mp3" if output_format == Models::AUDIO_MP3
+      Models::TTSResponse.new(
+        audio_data: wav_data,
+        duration: output_samples.length.to_f / wav_spec[:sample_rate],
+        format: Models::AUDIO_WAV
+      )
+    end
+    private
+    def build_plan
+      plan = []
+      @parts.each do |part|
+        if part.is_a?(PausePart)
+          plan << part
+          next
+        end
+        Typecast.parse_pause_markup(part[:text]).each do |parsed|
+          if parsed.is_a?(PausePart)
+            plan << parsed
+            next
+          end
+          next if parsed.text.strip.empty?
+          raise ArgumentError, "voice_id is required for composed speech segments" if part[:settings][:voice_id].to_s.empty?
+          raise ArgumentError, "model is required for composed speech segments" if part[:settings][:model].to_s.empty?
+          plan << { kind: "speech", text: parsed.text, settings: part[:settings] }
+        end
+      end
+      plan
+    end
+    def settings_hash(voice_id:, model:, language:, prompt:, output:, seed:)
+      {
+        voice_id: voice_id,
+        model: model,
+        language: language,
+        prompt: prompt,
+        output: output_hash(output),
+        seed: seed
+      }.reject { |_key, value| value.nil? }
+    end
+    def merge_settings(base, override)
+      merged = base.merge(override)
+      merged[:output] = merge_output(base[:output], override[:output])
+      merged.reject { |_key, value| value.nil? }
+    end
+    def merge_output(base, override)
+      return nil if base.nil? && override.nil?
+      (base || {}).merge(override || {})
+    end
+    def output_hash(output)
+      return nil if output.nil?
+      return output.to_h if output.respond_to?(:to_h)
+      output
+    end
+    def request_from_settings(text, settings)
+      output = merge_output(settings[:output], audio_format: Models::AUDIO_WAV)
+      Models::TTSRequest.new(
+        voice_id: settings[:voice_id],
+        text: text,
+        model: settings[:model],
+        language: settings[:language],
+        prompt: settings[:prompt],
+        output: Models::Output.new(**output),
+        seed: settings[:seed]
+      )
+    end
+    def parse_wav(data)
+      io = StringIO.new(data)
+      raise ArgumentError, "unsupported WAV data" unless io.read(4) == "RIFF"
+      io.read(4)
+      raise ArgumentError, "unsupported WAV data" unless io.read(4) == "WAVE"
+      spec = nil
+      samples = nil
+      until io.eof?
+        chunk_id = io.read(4)
+        break if chunk_id.nil? || chunk_id.bytesize < 4
+        chunk_size_bytes = io.read(4)
+        raise ArgumentError, "unsupported WAV data" if chunk_size_bytes.nil? || chunk_size_bytes.bytesize < 4
+        chunk_size = chunk_size_bytes.unpack1("V")
+        chunk_data = io.read(chunk_size)
+        io.read(1) if chunk_size.odd?
+        raise ArgumentError, "unsupported WAV data" if chunk_data.nil? || chunk_data.bytesize < chunk_size
+        case chunk_id
+        when "fmt "
+          audio_format, channels, sample_rate, _byte_rate, _block_align, bits_per_sample = chunk_data.unpack("vvVVvv")
+          if audio_format != 1 || channels != 1 || bits_per_sample != 16
+            raise ArgumentError, "only mono 16-bit PCM WAV is supported for composed speech"
+          end
+          spec = { sample_rate: sample_rate, channels: channels, bits_per_sample: bits_per_sample }
+        when "data"
+          samples = chunk_data.unpack("s<*")
+        end
+      end
+      raise ArgumentError, "unsupported WAV data" if spec.nil? || samples.nil?
+      { spec: spec, samples: samples }
+    end
+    def encode_wav(samples, spec)
+      payload = samples.pack("s<*")
+      [
+        "RIFF",
+        [36 + payload.bytesize].pack("V"),
+        "WAVE",
+        "fmt ",
+        [16, 1, spec[:channels], spec[:sample_rate], spec[:sample_rate] * spec[:channels] * 2, spec[:channels] * 2, spec[:bits_per_sample]].pack("VvvVVvv"),
+        "data",
+        [payload.bytesize].pack("V"),
+        payload
+      ].join
+    end
+    def trim_silence(samples)
+      start_index = 0
+      end_index = samples.length
+      start_index += 1 while start_index < end_index && samples[start_index].abs <= 0
+      end_index -= 1 while end_index > start_index && samples[end_index - 1].abs <= 0
+      samples[start_index...end_index] || []
+    end
+    def seconds_to_samples(seconds, sample_rate)
+      (seconds * sample_rate).round
+    end
+  end
+end

data/lib/typecast/models.rb CHANGED Viewed

@@ -90,6 +90,7 @@ module Typecast
     class TTSRequest
       attr_reader :voice_id, :text, :model, :language, :prompt, :output, :seed
+      # Browse available API voices at https://typecast.ai/developers/api/voices.
       def initialize(voice_id:, text:, model:, language: nil, prompt: nil, output: nil, seed: nil)
         @voice_id = voice_id
         @text = text

data/lib/typecast.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 require "typecast/client"
+require "typecast/composer"
 require "typecast/errors"
 require "typecast/models"
 require "typecast/timestamps"

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: typecast-ruby
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.3
 platform: ruby
 authors:
 - Neosapience
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2026-06-15 00:00:00.000000000 Z
+date: 2026-06-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: minitest
@@ -51,6 +51,7 @@ files:
 - THIRD-PARTY-LICENSES.md
 - lib/typecast.rb
 - lib/typecast/client.rb
+- lib/typecast/composer.rb
 - lib/typecast/errors.rb
 - lib/typecast/models.rb
 - lib/typecast/timestamps.rb