RubyGems - ruby_speech - Versions diffs - 0.1.0 - Mend

ruby_speech 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

data/.gitignore +7 -0
data/.rspec +3 -0
data/CHANGELOG.md +2 -0
data/Gemfile +4 -0
data/LICENSE.md +20 -0
data/README.md +76 -0
data/Rakefile +22 -0
data/assets/synthesis-core.xsd +442 -0
data/assets/synthesis.xsd +63 -0
data/assets/xml.xsd +287 -0
data/lib/ruby_speech.rb +15 -0
data/lib/ruby_speech/ssml.rb +24 -0
data/lib/ruby_speech/ssml/break.rb +71 -0
data/lib/ruby_speech/ssml/element.rb +26 -0
data/lib/ruby_speech/ssml/emphasis.rb +53 -0
data/lib/ruby_speech/ssml/prosody.rb +180 -0
data/lib/ruby_speech/ssml/say_as.rb +109 -0
data/lib/ruby_speech/ssml/speak.rb +57 -0
data/lib/ruby_speech/ssml/voice.rb +125 -0
data/lib/ruby_speech/version.rb +3 -0
data/lib/ruby_speech/xml.rb +7 -0
data/lib/ruby_speech/xml/language.rb +13 -0
data/ruby_speech.gemspec +31 -0
data/spec/ruby_speech/ssml/break_spec.rb +85 -0
data/spec/ruby_speech/ssml/emphasis_spec.rb +100 -0
data/spec/ruby_speech/ssml/prosody_spec.rb +286 -0
data/spec/ruby_speech/ssml/say_as_spec.rb +61 -0
data/spec/ruby_speech/ssml/speak_spec.rb +123 -0
data/spec/ruby_speech/ssml/voice_spec.rb +188 -0
data/spec/ruby_speech/ssml_spec.rb +65 -0
data/spec/spec_helper.rb +20 -0
data/spec/support/matchers.rb +45 -0
metadata +232 -0

data/lib/ruby_speech/ssml/element.rb ADDED Viewed

@@ -0,0 +1,26 @@
+module RubySpeech
+  module SSML
+    class Element < Niceogiri::XML::Node
+      def self.new(element_name, atts = {}, &block)
+        super element_name do |new_node|
+          atts.each_pair { |k, v| new_node.send :"#{k}=", v }
+          block_return = new_node.instance_eval &block if block_given?
+          new_node << block_return if block_return.is_a?(String)
+        end
+      end
+      def method_missing(method_name, *args, &block)
+        const = SSML.const_get(method_name.to_s.titleize.gsub(' ', ''))
+        if const && self.valid_child_type?(const)
+          self << const.new(*args, &block)
+        else
+          super
+        end
+      end
+      def eql?(o, *args)
+        super o, :content, *args
+      end
+    end # Element
+  end # SSML
+end # RubySpeech

data/lib/ruby_speech/ssml/emphasis.rb ADDED Viewed

@@ -0,0 +1,53 @@
+module RubySpeech
+  module SSML
+    ##
+    # The emphasis element requests that the contained text be spoken with emphasis (also referred to as prominence or stress). The synthesis processor determines how to render emphasis since the nature of emphasis differs between languages, dialects or even voices.
+    #
+    # http://www.w3.org/TR/speech-synthesis/#S3.2.2
+    #
+    class Emphasis < Element
+      VALID_LEVELS = [:strong, :moderate, :none, :reduced].freeze
+      VALID_CHILD_TYPES = [String, Break, Emphasis, Prosody, SayAs, Voice].freeze
+      ##
+      # Create a new SSML emphasis element
+      #
+      # @param [Hash] atts Key-value pairs of options mapping to setter methods
+      #
+      # @return [Emphasis] an element for use in an SSML document
+      #
+      def self.new(atts = {}, &block)
+        super 'emphasis', atts, &block
+      end
+      ##
+      # Indicates the strength of emphasis to be applied. Defined values are "strong", "moderate", "none" and "reduced". The default level is "moderate". The meaning of "strong" and "moderate" emphasis is interpreted according to the language being spoken (languages indicate emphasis using a possible combination of pitch change, timing changes, loudness and other acoustic differences). The "reduced" level is effectively the opposite of emphasizing a word. For example, when the phrase "going to" is reduced it may be spoken as "gonna". The "none" level is used to prevent the synthesis processor from emphasizing words that it might typically emphasize. The values "none", "moderate", and "strong" are monotonically non-decreasing in strength.
+      #
+      # @return [Symbol]
+      #
+      def level
+        read_attr :level, :to_sym
+      end
+      ##
+      # @param [Symbol] l the level. Must be one of VALID_LEVELS
+      #
+      # @raises ArgumentError if l is not one of VALID_LEVELS
+      #
+      def level=(l)
+        raise ArgumentError, "You must specify a valid level (#{VALID_LEVELS.map(&:inspect).join ', '})" unless VALID_LEVELS.include? l
+        write_attr :level, l
+      end
+      def <<(arg)
+        raise InvalidChildError, "An Emphasis can only accept String, Audio, Break, Emphasis, Mark, Phoneme, Prosody, SayAs, Sub, Voice as children" unless VALID_CHILD_TYPES.include? arg.class
+        super
+      end
+      def eql?(o)
+        super o, :level
+      end
+    end # Emphasis
+  end # SSML
+end # RubySpeech

data/lib/ruby_speech/ssml/prosody.rb ADDED Viewed

@@ -0,0 +1,180 @@
+module RubySpeech
+  module SSML
+    ##
+    # The prosody element permits control of the pitch, speaking rate and volume of the speech output.
+    #
+    # http://www.w3.org/TR/speech-synthesis/#S3.2.4
+    #
+    # Although each attribute individually is optional, it is an error if no attributes are specified when the prosody element is used. The "x-foo" attribute value names are intended to be mnemonics for "extra foo". Note also that customary pitch levels and standard pitch ranges may vary significantly by language, as may the meanings of the labelled values for pitch targets and ranges.
+    #
+    # The duration attribute takes precedence over the rate attribute. The contour attribute takes precedence over the pitch and range attributes.
+    #
+    # The default value of all prosodic attributes is no change. For example, omitting the rate attribute means that the rate is the same within the element as outside.
+    #
+    class Prosody < Element
+      VALID_PITCHES     = [:'x-low', :low, :medium, :high, :'x-high', :default].freeze
+      VALID_VOLUMES     = [:silent, :'x-soft', :soft, :medium, :loud, :'x-loud', :default].freeze
+      VALID_RATES       = [:'x-slow', :slow, :medium, :fast, :'x-fast', :default].freeze
+      VALID_CHILD_TYPES = [String, Break, Emphasis, Prosody, SayAs, Voice].freeze
+      ##
+      # Create a new SSML prosody element
+      #
+      # @param [Hash] atts Key-value pairs of options mapping to setter methods
+      #
+      # @return [Prosody] an element for use in an SSML document
+      #
+      def self.new(atts = {}, &block)
+        super 'prosody', atts, &block
+      end
+      ##
+      # The baseline pitch for the contained text. Although the exact meaning of "baseline pitch" will vary across synthesis processors, increasing/decreasing this value will typically increase/decrease the approximate pitch of the output. Legal values are: a number followed by "Hz", a relative change or "x-low", "low", "medium", "high", "x-high", or "default". Labels "x-low" through "x-high" represent a sequence of monotonically non-decreasing pitch levels.
+      #
+      # @return [Symbol, String]
+      #
+      def pitch
+        value = read_attr :pitch
+        return unless value
+        if value.include?('Hz')
+          value
+        elsif VALID_PITCHES.include?(value.to_sym)
+          value.to_sym
+        end
+      end
+      ##
+      # @param [Symbol, String] p
+      #
+      # @raises ArgumentError if p is not a string that contains 'Hz' or one of VALID_PITCHES
+      #
+      def pitch=(p)
+        hz = p.is_a?(String) && p.include?('Hz') && p.to_f > 0
+        raise ArgumentError, "You must specify a valid pitch (\"[positive-number]Hz\", #{VALID_PITCHES.map(&:inspect).join ', '})" unless hz || VALID_PITCHES.include?(p)
+        write_attr :pitch, p
+      end
+      ##
+      # The actual pitch contour for the contained text.
+      #
+      # The pitch contour is defined as a set of white space-separated targets at specified time positions in the speech output. The algorithm for interpolating between the targets is processor-specific. In each pair of the form (time position,target), the first value is a percentage of the period of the contained text (a number followed by "%") and the second value is the value of the pitch attribute (a number followed by "Hz", a relative change, or a label value). Time position values outside 0% to 100% are ignored. If a pitch value is not defined for 0% or 100% then the nearest pitch target is copied. All relative values for the pitch are relative to the pitch value just before the contained text.
+      #
+      # @return [Symbol]
+      #
+      def contour
+        read_attr :contour
+      end
+      ##
+      # @param [String] v
+      #
+      def contour=(v)
+        write_attr :contour, v
+      end
+      ##
+      # The pitch range (variability) for the contained text. Although the exact meaning of "pitch range" will vary across synthesis processors, increasing/decreasing this value will typically increase/decrease the dynamic range of the output pitch. Legal values are: a number followed by "Hz", a relative change or "x-low", "low", "medium", "high", "x-high", or "default". Labels "x-low" through "x-high" represent a sequence of monotonically non-decreasing pitch ranges.
+      #
+      # @return [Symbol]
+      #
+      def range
+        value = read_attr :range
+        return unless value
+        if value.include?('Hz')
+          value
+        elsif VALID_PITCHES.include?(value.to_sym)
+          value.to_sym
+        end
+      end
+      ##
+      # @param [Symbol, String] p
+      #
+      # @raises ArgumentError if p is not a string that contains 'Hz' or one of VALID_PITCHES
+      #
+      def range=(p)
+        hz = p.is_a?(String) && p.include?('Hz') && p.to_f > 0
+        raise ArgumentError, "You must specify a valid range (\"[positive-number]Hz\", #{VALID_PITCHES.map(&:inspect).join ', '})" unless hz || VALID_PITCHES.include?(p)
+        write_attr :range, p
+      end
+      ##
+      # A change in the speaking rate for the contained text. Legal values are: a relative change or "x-slow", "slow", "medium", "fast", "x-fast", or "default". Labels "x-slow" through "x-fast" represent a sequence of monotonically non-decreasing speaking rates. When a number is used to specify a relative change it acts as a multiplier of the default rate. For example, a value of 1 means no change in speaking rate, a value of 2 means a speaking rate twice the default rate, and a value of 0.5 means a speaking rate of half the default rate. The default rate for a voice depends on the language and dialect and on the personality of the voice. The default rate for a voice should be such that it is experienced as a normal speaking rate for the voice when reading aloud text. Since voices are processor-specific, the default rate will be as well.
+      #
+      # @return [Symbol, Float]
+      #
+      def rate
+        value = read_attr :rate
+        return unless value
+        if VALID_RATES.include?(value.to_sym)
+          value.to_sym
+        else
+          value.to_f
+        end
+      end
+      ##
+      # @param [Symbol, Numeric] v
+      #
+      # @raises ArgumentError if v is not either a positive Numeric or one of VALID_RATES
+      #
+      def rate=(v)
+        raise ArgumentError, "You must specify a valid rate ([positive-number](multiplier), #{VALID_RATES.map(&:inspect).join ', '})" unless (v.is_a?(Numeric) && v >= 0) || VALID_RATES.include?(v)
+        write_attr :rate, v
+      end
+      ##
+      # A value in seconds for the desired time to take to read the element contents.
+      #
+      # @return [Integer]
+      #
+      def duration
+        read_attr :duration, :to_i
+      end
+      ##
+      # @param [Numeric] t
+      #
+      # @raises ArgumentError if t is not a positive numeric value
+      #
+      def duration=(t)
+        raise ArgumentError, "You must specify a valid duration (positive float value in seconds)" unless t.is_a?(Numeric) && t >= 0
+        write_attr :duration, "#{t}s"
+      end
+      ##
+      # The volume for the contained text in the range 0.0 to 100.0 (higher values are louder and specifying a value of zero is equivalent to specifying "silent"). Legal values are: number, a relative change or "silent", "x-soft", "soft", "medium", "loud", "x-loud", or "default". The volume scale is linear amplitude. The default is 100.0. Labels "silent" through "x-loud" represent a sequence of monotonically non-decreasing volume levels.
+      #
+      # @return [Symbol, Float]
+      #
+      def volume
+        value = read_attr :volume
+        if VALID_VOLUMES.include?(value.to_sym)
+          value.to_sym
+        else
+          value.to_f
+        end
+      end
+      ##
+      # @param [Numeric, Symbol] v
+      #
+      # @raises ArgumentError if v is not one of VALID_VOLUMES or a numeric value between 0.0 and 100.0
+      #
+      def volume=(v)
+        raise ArgumentError, "You must specify a valid volume ([positive-number](0.0 -> 100.0), #{VALID_VOLUMES.map(&:inspect).join ', '})" unless (v.is_a?(Numeric) && (0..100).include?(v)) || VALID_VOLUMES.include?(v)
+        write_attr :volume, v
+      end
+      def <<(arg)
+        raise InvalidChildError, "A Prosody can only accept String, Audio, Break, Emphasis, Mark, P, Phoneme, Prosody, SayAs, Sub, S, Voice as children" unless VALID_CHILD_TYPES.include? arg.class
+        super
+      end
+      def eql?(o)
+        super o, :pitch, :contour, :range, :rate, :duration, :volume
+      end
+    end # Prosody
+  end # SSML
+end # RubySpeech

data/lib/ruby_speech/ssml/say_as.rb ADDED Viewed

@@ -0,0 +1,109 @@
+module RubySpeech
+  module SSML
+    ##
+    # The say-as element allows the author to indicate information on the type of text construct contained within the element and to help specify the level of detail for rendering the contained text.
+    #
+    # http://www.w3.org/TR/speech-synthesis/#S3.1.8
+    #
+    # Defining a comprehensive set of text format types is difficult because of the variety of languages that have to be considered and because of the innate flexibility of written languages. SSML only specifies the say-as element, its attributes, and their purpose. It does not enumerate the possible values for the attributes. The Working Group expects to produce a separate document that will define standard values and associated normative behavior for these values. Examples given here are only for illustrating the purpose of the element and the attributes.
+    #
+    # The say-as element has three attributes: interpret-as, format, and detail. The interpret-as attribute is always required; the other two attributes are optional. The legal values for the format attribute depend on the value of the interpret-as attribute.
+    #
+    # The say-as element can only contain text to be rendered.
+    #
+    # When specified, the interpret-as and format values are to be interpreted by the synthesis processor as hints provided by the markup document author to aid text normalization and pronunciation.
+    #
+    # In all cases, the text enclosed by any say-as element is intended to be a standard, orthographic form of the language currently in context. A synthesis processor should be able to support the common, orthographic forms of the specified language for every content type that it supports.
+    #
+    # When the content of the say-as element contains additional text next to the content that is in the indicated format and interpret-as type, then this additional text must be rendered. The processor may make the rendering of the additional text dependent on the interpret-as type of the element in which it appears.
+    # When the content of the say-as element contains no content in the indicated interpret-as type or format, the processor must render the content either as if the format attribute were not present, or as if the interpret-as attribute were not present, or as if neither the format nor interpret-as attributes were present. The processor should also notify the environment of the mismatch.
+    #
+    # Indicating the content type or format does not necessarily affect the way the information is pronounced. A synthesis processor should pronounce the contained text in a manner in which such content is normally produced for the language.
+    #
+    class SayAs < Element
+      VALID_CHILD_TYPES = [String].freeze
+      ##
+      # Create a new SSML say-as element
+      #
+      # @param [Hash] atts Key-value pairs of options mapping to setter methods
+      #
+      # @return [Prosody] an element for use in an SSML document
+      #
+      def self.new(interpret_as, atts = {}, &block)
+        super 'say-as', atts.merge(interpret_as: interpret_as), &block
+      end
+      ##
+      #
+      # The interpret-as attribute indicates the content type of the contained text construct. Specifying the content type helps the synthesis processor to distinguish and interpret text constructs that may be rendered in different ways depending on what type of information is intended.
+      #
+      # When the value for the interpret-as attribute is unknown or unsupported by a processor, it must render the contained text as if no interpret-as value were specified.
+      #
+      # @return [String]
+      #
+      def interpret_as
+        read_attr :'interpret-as'
+      end
+      ##
+      # @param [String] ia
+      #
+      def interpret_as=(ia)
+        write_attr :'interpret-as', ia
+      end
+      ##
+      #
+      # Can give further hints on the precise formatting of the contained text for content types that may have ambiguous formats.
+      #
+      # When the value for the format attribute is unknown or unsupported by a processor, it must render the contained text as if no format value were specified, and should render it using the interpret-as value that is specified.
+      #
+      # @return [String]
+      #
+      def format
+        read_attr :format
+      end
+      ##
+      # @param [String] format
+      #
+      def format=(format)
+        write_attr :format, format
+      end
+      ##
+      #
+      # The detail attribute is an optional attribute that indicates the level of detail to be read aloud or rendered. Every value of the detail attribute must render all of the informational content in the contained text; however, specific values for the detail attribute can be used to render content that is not usually informational in running text but may be important to render for specific purposes. For example, a synthesis processor will usually render punctuations through appropriate changes in prosody. Setting a higher level of detail may be used to speak punctuations explicitly, e.g. for reading out coded part numbers or pieces of software code.
+      #
+      # The detail attribute can be used for all interpret-as types.
+      #
+      # If the detail attribute is not specified, the level of detail that is produced by the synthesis processor depends on the text content and the language.
+      #
+      # When the value for the detail attribute is unknown or unsupported by a processor, it must render the contained text as if no value were specified for the detail attribute.
+      #
+      # @return [String]
+      #
+      def detail
+        read_attr :detail
+      end
+      ##
+      # @param [String] detail
+      #
+      def detail=(detail)
+        write_attr :detail, detail
+      end
+      def <<(arg)
+        raise InvalidChildError, "A SayAs can only accept Strings as children" unless VALID_CHILD_TYPES.include? arg.class
+        super
+      end
+      def eql?(o)
+        super o, :interpret_as, :format, :detail
+      end
+    end # SayAs
+  end # SSML
+end # RubySpeech

data/lib/ruby_speech/ssml/speak.rb ADDED Viewed

@@ -0,0 +1,57 @@
+module RubySpeech
+  module SSML
+    ##
+    # The Speech Synthesis Markup Language is an XML application. The root element is speak.
+    #
+    # http://www.w3.org/TR/speech-synthesis/#S3.1.1
+    #
+    class Speak < Element
+      include XML::Language
+      VALID_CHILD_TYPES = [String, Break, Emphasis, Prosody, SayAs, Voice].freeze
+      ##
+      # Create a new SSML speak root element
+      #
+      # @param [Hash] atts Key-value pairs of options mapping to setter methods
+      #
+      # @return [Speak] an element for use in an SSML document
+      #
+      def self.new(atts = {}, &block)
+        super('speak', atts) do
+          self[:version] = '1.0'
+          self.namespace = 'http://www.w3.org/2001/10/synthesis'
+          self.language ||= "en-US"
+          instance_eval &block if block_given?
+        end
+      end
+      ##
+      # @return [String] the base URI to which relative URLs are resolved
+      #
+      def base_uri
+        read_attr :base
+      end
+      ##
+      # @param [String] uri the base URI to which relative URLs are resolved
+      #
+      def base_uri=(uri)
+        write_attr 'xml:base', uri
+      end
+      def <<(arg)
+        raise InvalidChildError, "A Speak can only accept String, Audio, Break, Emphasis, Mark, P, Phoneme, Prosody, SayAs, Sub, S, Voice as children" unless VALID_CHILD_TYPES.include? arg.class
+        super
+      end
+      def valid_child_type?(type)
+        VALID_CHILD_TYPES.include? type
+      end
+      def eql?(o)
+        super o, :language, :base_uri
+      end
+    end # Speak
+  end # SSML
+end # RubySpeech

data/lib/ruby_speech/ssml/voice.rb ADDED Viewed

@@ -0,0 +1,125 @@
+module RubySpeech
+  module SSML
+    ##
+    # The voice element is a production element that requests a change in speaking voice.
+    #
+    # http://www.w3.org/TR/speech-synthesis/#S3.2.1
+    #
+    class Voice < Element
+      include XML::Language
+      VALID_GENDERS = [:male, :female, :neutral].freeze
+      VALID_CHILD_TYPES = [String, Break, Emphasis, Prosody, SayAs, Voice].freeze
+      ##
+      # Create a new SSML voice element
+      #
+      # @param [Hash] atts Key-value pairs of options mapping to setter methods
+      #
+      # @return [Voice] an element for use in an SSML document
+      #
+      def self.new(atts = {}, &block)
+        super 'voice', atts, &block
+      end
+      ##
+      # Indicates the preferred gender of the voice to speak the contained text. Enumerated values are: "male", "female", "neutral".
+      #
+      # @return [Symbol]
+      #
+      def gender
+        read_attr :gender, :to_sym
+      end
+      ##
+      # @param [Symbol] g the gender selected from VALID_GENDERS
+      #
+      # @raises ArgumentError if g is not one of VALID_GENDERS
+      #
+      def gender=(g)
+        raise ArgumentError, "You must specify a valid gender (#{VALID_GENDERS.map(&:inspect).join ', '})" unless VALID_GENDERS.include? g
+        write_attr :gender, g
+      end
+      ##
+      # Indicates the preferred age in years (since birth) of the voice to speak the contained text.
+      #
+      # @return [Integer]
+      #
+      def age
+        read_attr :age, :to_i
+      end
+      ##
+      # @param [Integer] i the age of the voice
+      #
+      # @raises ArgumentError if i is not a non-negative integer
+      #
+      def age=(i)
+        raise ArgumentError, "You must specify a valid age (non-negative integer)" unless i.is_a?(Integer) && i >= 0
+        write_attr :age, i
+      end
+      ##
+      # Indicates a preferred variant of the other voice characteristics to speak the contained text. (e.g. the second male child voice).
+      #
+      # @return [Integer]
+      #
+      def variant
+        read_attr :variant, :to_i
+      end
+      ##
+      # @param [Integer] i the variant of the voice
+      #
+      # @raises ArgumentError if i is not a non-negative integer
+      #
+      def variant=(i)
+        raise ArgumentError, "You must specify a valid variant (positive integer)" unless i.is_a?(Integer) && i > 0
+        write_attr :variant, i
+      end
+      ##
+      # A processor-specific voice name to speak the contained text.
+      #
+      # @return [String, Array, nil] the name or names of the voice
+      #
+      def name
+        names = read_attr :name
+        return unless names
+        names = names.split ' '
+        case names.count
+        when 0 then nil
+        when 1 then names.first
+        else names
+        end
+      end
+      ##
+      # @param [String, Array] the name or names of the voice. May be an array of names ordered from top preference down. The names must not contain any white space.
+      #
+      def name=(n)
+        # TODO: Raise ArgumentError if names contain whitespace
+        n = n.join(' ') if n.is_a? Array
+        write_attr :name, n
+      end
+      def valid_child_types
+        VALID_CHILD_TYPES
+      end
+      def <<(arg)
+        raise InvalidChildError, "A Voice can only accept String, Audio, Break, Emphasis, Mark, P, Phoneme, Prosody, SayAs, Sub, S, Voice as children" unless VALID_CHILD_TYPES.include? arg.class
+        super
+      end
+      def valid_child_type?(type)
+        VALID_CHILD_TYPES.include? type
+      end
+      def eql?(o)
+        super o, :language, :gender, :age, :variant, :name
+      end
+    end # Voice
+  end # SSML
+end # RubySpeech