ruby_speech 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,26 @@
module RubySpeech
  module SSML
    ##
    # Common base class for SSML elements.
    #
    # Provides a constructor that applies attributes through their setter
    # methods, a small DSL (via method_missing) for appending child elements
    # by name from inside a construction block, and content-aware equality.
    #
    class Element < Niceogiri::XML::Node
      ##
      # Create a new element.
      #
      # @param [String] element_name the XML element name
      # @param [Hash] atts Key-value pairs of options mapping to setter methods
      # @param [Proc] block optional block evaluated in the context of the new
      #   node. If the block returns a String, that string is appended to the node.
      #
      # @return [Element] whatever the superclass constructor returns
      #
      def self.new(element_name, atts = {}, &block)
        super element_name do |new_node|
          # Every key is routed through the matching "key=" setter so that
          # per-attribute validation in subclasses is applied.
          atts.each_pair { |k, v| new_node.send :"#{k}=", v }
          block_return = new_node.instance_eval(&block) if block_given?
          new_node << block_return if block_return.is_a?(String)
        end
      end

      ##
      # DSL hook: a call such as +say_as+ is mapped to the constant +SayAs+
      # looked up through the SSML module. When that constant exists and is an
      # acceptable child of this element, a new instance is built from the given
      # arguments and appended. Any other call falls through to the normal
      # method_missing behaviour (previously an unknown name raised NameError
      # from const_get before the fallback could be reached).
      #
      def method_missing(method_name, *args, &block)
        const = child_class_for(method_name)
        if const && valid_child_type?(const)
          self << const.new(*args, &block)
        else
          super
        end
      end

      ##
      # Keeps respond_to? in agreement with method_missing.
      #
      def respond_to_missing?(method_name, include_private = false)
        const = child_class_for(method_name)
        (const && valid_child_type?(const)) || super
      end

      ##
      # Default child-type check used by the DSL. Subclasses that declare a
      # VALID_CHILD_TYPES constant get it honoured without having to define
      # this method themselves; subclasses remain free to override it.
      #
      # @param [Class] type the candidate child class
      #
      # @return [Boolean]
      #
      def valid_child_type?(type)
        self.class.const_defined?(:VALID_CHILD_TYPES) &&
          self.class::VALID_CHILD_TYPES.include?(type)
      end

      ##
      # Equality that always includes the element content in addition to the
      # attribute names supplied by the caller (normally a subclass).
      #
      def eql?(o, *args)
        super o, :content, *args
      end

      private

      ##
      # Resolve a DSL method name to a constant reachable from SSML.
      #
      # NOTE(review): String#titleize is not part of the Ruby standard library;
      # it is presumably provided by ActiveSupport, loaded elsewhere - confirm.
      #
      # @return [Object, nil] the constant, or nil when the derived name is not
      #   a legal constant name or no such constant is defined
      #
      def child_class_for(method_name)
        const_name = method_name.to_s.titleize.gsub(' ', '')
        # Names such as "foo?" or "bar=" are not legal constant names and would
        # make const_defined?/const_get raise NameError.
        return unless const_name =~ /\A[A-Z]\w*\z/
        return unless SSML.const_defined?(const_name)
        SSML.const_get(const_name)
      end
    end # Element
  end # SSML
end # RubySpeech
@@ -0,0 +1,53 @@
module RubySpeech
  module SSML
    ##
    # The emphasis element requests that the contained text be spoken with
    # emphasis (also referred to as prominence or stress). The synthesis
    # processor determines how to render emphasis since the nature of emphasis
    # differs between languages, dialects or even voices.
    #
    # http://www.w3.org/TR/speech-synthesis/#S3.2.2
    #
    class Emphasis < Element

      VALID_LEVELS = [:strong, :moderate, :none, :reduced].freeze
      VALID_CHILD_TYPES = [String, Break, Emphasis, Prosody, SayAs, Voice].freeze

      ##
      # Create a new SSML emphasis element
      #
      # @param [Hash] atts Key-value pairs of options mapping to setter methods
      #
      # @return [Emphasis] an element for use in an SSML document
      #
      def self.new(atts = {}, &block)
        super 'emphasis', atts, &block
      end

      ##
      # Indicates the strength of emphasis to be applied. Defined values are
      # "strong", "moderate", "none" and "reduced". The default level is
      # "moderate". The meaning of "strong" and "moderate" emphasis is
      # interpreted according to the language being spoken (languages indicate
      # emphasis using a possible combination of pitch change, timing changes,
      # loudness and other acoustic differences). The "reduced" level is
      # effectively the opposite of emphasizing a word. For example, when the
      # phrase "going to" is reduced it may be spoken as "gonna". The "none"
      # level is used to prevent the synthesis processor from emphasizing words
      # that it might typically emphasize. The values "none", "moderate", and
      # "strong" are monotonically non-decreasing in strength.
      #
      # @return [Symbol]
      #
      def level
        read_attr :level, :to_sym
      end

      ##
      # @param [Symbol] l the level. Must be one of VALID_LEVELS
      #
      # @raise [ArgumentError] if l is not one of VALID_LEVELS
      #
      def level=(l)
        raise ArgumentError, "You must specify a valid level (#{VALID_LEVELS.map(&:inspect).join ', '})" unless VALID_LEVELS.include? l
        write_attr :level, l
      end

      ##
      # Append a child, rejecting anything whose class is not listed in
      # VALID_CHILD_TYPES.
      #
      # NOTE(review): the message names more types than VALID_CHILD_TYPES
      # currently contains; it is left untouched because callers may match on it.
      #
      # @raise [InvalidChildError] if arg's class is not in VALID_CHILD_TYPES
      #
      def <<(arg)
        raise InvalidChildError, "An Emphasis can only accept String, Audio, Break, Emphasis, Mark, Phoneme, Prosody, SayAs, Sub, Voice as children" unless VALID_CHILD_TYPES.include? arg.class
        super
      end

      ##
      # Whether the given class may be appended as a child. Defined here, as in
      # Speak and Voice, because Element#method_missing calls it when resolving
      # DSL child calls; without it those calls cannot succeed on an Emphasis.
      #
      # @param [Class] type the candidate child class
      #
      # @return [Boolean]
      #
      def valid_child_type?(type)
        VALID_CHILD_TYPES.include? type
      end

      ##
      # Equality additionally compares the level attribute.
      #
      def eql?(o)
        super o, :level
      end
    end # Emphasis
  end # SSML
end # RubySpeech
@@ -0,0 +1,180 @@
module RubySpeech
  module SSML
    ##
    # The prosody element permits control of the pitch, speaking rate and
    # volume of the speech output.
    #
    # http://www.w3.org/TR/speech-synthesis/#S3.2.4
    #
    # Although each attribute individually is optional, it is an error if no
    # attributes are specified when the prosody element is used. The "x-foo"
    # attribute value names are intended to be mnemonics for "extra foo". Note
    # also that customary pitch levels and standard pitch ranges may vary
    # significantly by language, as may the meanings of the labelled values for
    # pitch targets and ranges.
    #
    # The duration attribute takes precedence over the rate attribute. The
    # contour attribute takes precedence over the pitch and range attributes.
    #
    # The default value of all prosodic attributes is no change. For example,
    # omitting the rate attribute means that the rate is the same within the
    # element as outside.
    #
    class Prosody < Element

      VALID_PITCHES = [:'x-low', :low, :medium, :high, :'x-high', :default].freeze
      VALID_VOLUMES = [:silent, :'x-soft', :soft, :medium, :loud, :'x-loud', :default].freeze
      VALID_RATES = [:'x-slow', :slow, :medium, :fast, :'x-fast', :default].freeze
      VALID_CHILD_TYPES = [String, Break, Emphasis, Prosody, SayAs, Voice].freeze

      ##
      # Create a new SSML prosody element
      #
      # @param [Hash] atts Key-value pairs of options mapping to setter methods
      #
      # @return [Prosody] an element for use in an SSML document
      #
      def self.new(atts = {}, &block)
        super 'prosody', atts, &block
      end

      ##
      # The baseline pitch for the contained text. Although the exact meaning
      # of "baseline pitch" will vary across synthesis processors,
      # increasing/decreasing this value will typically increase/decrease the
      # approximate pitch of the output. Legal values are: a number followed by
      # "Hz", a relative change or "x-low", "low", "medium", "high", "x-high",
      # or "default". Labels "x-low" through "x-high" represent a sequence of
      # monotonically non-decreasing pitch levels.
      #
      # @return [Symbol, String, nil] the stored string when it contains "Hz",
      #   a Symbol when it is one of VALID_PITCHES, nil otherwise
      #
      def pitch
        value = read_attr :pitch
        return unless value
        if value.include?('Hz')
          value
        elsif VALID_PITCHES.include?(value.to_sym)
          value.to_sym
        end
      end

      ##
      # @param [Symbol, String] p
      #
      # @raise [ArgumentError] if p is neither one of VALID_PITCHES nor a String
      #   that contains 'Hz' and starts with a positive number
      #
      def pitch=(p)
        hz = p.is_a?(String) && p.include?('Hz') && p.to_f > 0
        raise ArgumentError, "You must specify a valid pitch (\"[positive-number]Hz\", #{VALID_PITCHES.map(&:inspect).join ', '})" unless hz || VALID_PITCHES.include?(p)
        write_attr :pitch, p
      end

      ##
      # The actual pitch contour for the contained text.
      #
      # The pitch contour is defined as a set of white space-separated targets
      # at specified time positions in the speech output. The algorithm for
      # interpolating between the targets is processor-specific. In each pair
      # of the form (time position,target), the first value is a percentage of
      # the period of the contained text (a number followed by "%") and the
      # second value is the value of the pitch attribute (a number followed by
      # "Hz", a relative change, or a label value). Time position values
      # outside 0% to 100% are ignored. If a pitch value is not defined for 0%
      # or 100% then the nearest pitch target is copied. All relative values
      # for the pitch are relative to the pitch value just before the contained
      # text.
      #
      # @return the stored contour, unconverted
      #
      def contour
        read_attr :contour
      end

      ##
      # @param [String] v the contour, stored without validation
      #
      def contour=(v)
        write_attr :contour, v
      end

      ##
      # The pitch range (variability) for the contained text. Although the
      # exact meaning of "pitch range" will vary across synthesis processors,
      # increasing/decreasing this value will typically increase/decrease the
      # dynamic range of the output pitch. Legal values are: a number followed
      # by "Hz", a relative change or "x-low", "low", "medium", "high",
      # "x-high", or "default". Labels "x-low" through "x-high" represent a
      # sequence of monotonically non-decreasing pitch ranges.
      #
      # @return [Symbol, String, nil] the stored string when it contains "Hz",
      #   a Symbol when it is one of VALID_PITCHES, nil otherwise
      #
      def range
        value = read_attr :range
        return unless value
        if value.include?('Hz')
          value
        elsif VALID_PITCHES.include?(value.to_sym)
          value.to_sym
        end
      end

      ##
      # @param [Symbol, String] p
      #
      # @raise [ArgumentError] if p is neither one of VALID_PITCHES nor a String
      #   that contains 'Hz' and starts with a positive number
      #
      def range=(p)
        hz = p.is_a?(String) && p.include?('Hz') && p.to_f > 0
        raise ArgumentError, "You must specify a valid range (\"[positive-number]Hz\", #{VALID_PITCHES.map(&:inspect).join ', '})" unless hz || VALID_PITCHES.include?(p)
        write_attr :range, p
      end

      ##
      # A change in the speaking rate for the contained text. Legal values are:
      # a relative change or "x-slow", "slow", "medium", "fast", "x-fast", or
      # "default". Labels "x-slow" through "x-fast" represent a sequence of
      # monotonically non-decreasing speaking rates. When a number is used to
      # specify a relative change it acts as a multiplier of the default rate.
      # For example, a value of 1 means no change in speaking rate, a value of
      # 2 means a speaking rate twice the default rate, and a value of 0.5
      # means a speaking rate of half the default rate. The default rate for a
      # voice depends on the language and dialect and on the personality of the
      # voice. The default rate for a voice should be such that it is
      # experienced as a normal speaking rate for the voice when reading aloud
      # text. Since voices are processor-specific, the default rate will be as
      # well.
      #
      # @return [Symbol, Float, nil] nil when the attribute is not set
      #
      def rate
        value = read_attr :rate
        return unless value
        if VALID_RATES.include?(value.to_sym)
          value.to_sym
        else
          value.to_f
        end
      end

      ##
      # @param [Symbol, Numeric] v
      #
      # @raise [ArgumentError] if v is neither a non-negative Numeric nor one
      #   of VALID_RATES
      #
      def rate=(v)
        raise ArgumentError, "You must specify a valid rate ([positive-number](multiplier), #{VALID_RATES.map(&:inspect).join ', '})" unless (v.is_a?(Numeric) && v >= 0) || VALID_RATES.include?(v)
        write_attr :rate, v
      end

      ##
      # A value in seconds for the desired time to take to read the element
      # contents.
      #
      # NOTE(review): the value is written as "<t>s" and read back through
      # :to_i, so a fractional duration presumably comes back truncated - confirm.
      #
      # @return [Integer]
      #
      def duration
        read_attr :duration, :to_i
      end

      ##
      # @param [Numeric] t the duration in seconds
      #
      # @raise [ArgumentError] if t is not a non-negative numeric value
      #
      def duration=(t)
        raise ArgumentError, "You must specify a valid duration (positive float value in seconds)" unless t.is_a?(Numeric) && t >= 0
        write_attr :duration, "#{t}s"
      end

      ##
      # The volume for the contained text in the range 0.0 to 100.0 (higher
      # values are louder and specifying a value of zero is equivalent to
      # specifying "silent"). Legal values are: number, a relative change or
      # "silent", "x-soft", "soft", "medium", "loud", "x-loud", or "default".
      # The volume scale is linear amplitude. The default is 100.0. Labels
      # "silent" through "x-loud" represent a sequence of monotonically
      # non-decreasing volume levels.
      #
      # @return [Symbol, Float, nil] nil when the attribute is not set
      #
      def volume
        value = read_attr :volume
        # Same guard as pitch/range/rate: without it an unset volume would call
        # to_sym on nil, which also broke eql? for any prosody without a volume.
        return unless value
        if VALID_VOLUMES.include?(value.to_sym)
          value.to_sym
        else
          value.to_f
        end
      end

      ##
      # @param [Numeric, Symbol] v
      #
      # @raise [ArgumentError] if v is not one of VALID_VOLUMES or a numeric
      #   value between 0 and 100 inclusive
      #
      def volume=(v)
        raise ArgumentError, "You must specify a valid volume ([positive-number](0.0 -> 100.0), #{VALID_VOLUMES.map(&:inspect).join ', '})" unless (v.is_a?(Numeric) && (0..100).include?(v)) || VALID_VOLUMES.include?(v)
        write_attr :volume, v
      end

      ##
      # Append a child, rejecting anything whose class is not listed in
      # VALID_CHILD_TYPES.
      #
      # NOTE(review): the message names more types than VALID_CHILD_TYPES
      # currently contains; it is left untouched because callers may match on it.
      #
      # @raise [InvalidChildError] if arg's class is not in VALID_CHILD_TYPES
      #
      def <<(arg)
        raise InvalidChildError, "A Prosody can only accept String, Audio, Break, Emphasis, Mark, P, Phoneme, Prosody, SayAs, Sub, S, Voice as children" unless VALID_CHILD_TYPES.include? arg.class
        super
      end

      ##
      # Whether the given class may be appended as a child. Defined here, as in
      # Speak and Voice, because Element#method_missing calls it when resolving
      # DSL child calls; without it those calls cannot succeed on a Prosody.
      #
      # @param [Class] type the candidate child class
      #
      # @return [Boolean]
      #
      def valid_child_type?(type)
        VALID_CHILD_TYPES.include? type
      end

      ##
      # Equality additionally compares every prosodic attribute.
      #
      def eql?(o)
        super o, :pitch, :contour, :range, :rate, :duration, :volume
      end
    end # Prosody
  end # SSML
end # RubySpeech
@@ -0,0 +1,109 @@
module RubySpeech
  module SSML
    ##
    # The say-as element allows the author to indicate information on the type
    # of text construct contained within the element and to help specify the
    # level of detail for rendering the contained text.
    #
    # http://www.w3.org/TR/speech-synthesis/#S3.1.8
    #
    # Defining a comprehensive set of text format types is difficult because of
    # the variety of languages that have to be considered and because of the
    # innate flexibility of written languages. SSML only specifies the say-as
    # element, its attributes, and their purpose. It does not enumerate the
    # possible values for the attributes. The Working Group expects to produce
    # a separate document that will define standard values and associated
    # normative behavior for these values. Examples given here are only for
    # illustrating the purpose of the element and the attributes.
    #
    # The say-as element has three attributes: interpret-as, format, and
    # detail. The interpret-as attribute is always required; the other two
    # attributes are optional. The legal values for the format attribute depend
    # on the value of the interpret-as attribute.
    #
    # The say-as element can only contain text to be rendered.
    #
    # When specified, the interpret-as and format values are to be interpreted
    # by the synthesis processor as hints provided by the markup document
    # author to aid text normalization and pronunciation.
    #
    # In all cases, the text enclosed by any say-as element is intended to be a
    # standard, orthographic form of the language currently in context. A
    # synthesis processor should be able to support the common, orthographic
    # forms of the specified language for every content type that it supports.
    #
    # When the content of the say-as element contains additional text next to
    # the content that is in the indicated format and interpret-as type, then
    # this additional text must be rendered. The processor may make the
    # rendering of the additional text dependent on the interpret-as type of
    # the element in which it appears.
    # When the content of the say-as element contains no content in the
    # indicated interpret-as type or format, the processor must render the
    # content either as if the format attribute were not present, or as if the
    # interpret-as attribute were not present, or as if neither the format nor
    # interpret-as attributes were present. The processor should also notify
    # the environment of the mismatch.
    #
    # Indicating the content type or format does not necessarily affect the way
    # the information is pronounced. A synthesis processor should pronounce the
    # contained text in a manner in which such content is normally produced for
    # the language.
    #
    class SayAs < Element

      VALID_CHILD_TYPES = [String].freeze

      ##
      # Create a new SSML say-as element
      #
      # @param [String] interpret_as the content type of the contained text.
      #   It is merged into atts last, so it overrides any :interpret_as key
      #   already present there.
      # @param [Hash] atts Key-value pairs of options mapping to setter methods
      #
      # @return [SayAs] an element for use in an SSML document
      #
      def self.new(interpret_as, atts = {}, &block)
        super 'say-as', atts.merge(interpret_as: interpret_as), &block
      end

      ##
      #
      # The interpret-as attribute indicates the content type of the contained
      # text construct. Specifying the content type helps the synthesis
      # processor to distinguish and interpret text constructs that may be
      # rendered in different ways depending on what type of information is
      # intended.
      #
      # When the value for the interpret-as attribute is unknown or unsupported
      # by a processor, it must render the contained text as if no interpret-as
      # value were specified.
      #
      # @return [String]
      #
      def interpret_as
        read_attr :'interpret-as'
      end

      ##
      # @param [String] ia
      #
      def interpret_as=(ia)
        write_attr :'interpret-as', ia
      end

      ##
      #
      # Can give further hints on the precise formatting of the contained text
      # for content types that may have ambiguous formats.
      #
      # When the value for the format attribute is unknown or unsupported by a
      # processor, it must render the contained text as if no format value were
      # specified, and should render it using the interpret-as value that is
      # specified.
      #
      # @return [String]
      #
      def format
        read_attr :format
      end

      ##
      # @param [String] format
      #
      def format=(format)
        write_attr :format, format
      end

      ##
      #
      # The detail attribute is an optional attribute that indicates the level
      # of detail to be read aloud or rendered. Every value of the detail
      # attribute must render all of the informational content in the contained
      # text; however, specific values for the detail attribute can be used to
      # render content that is not usually informational in running text but
      # may be important to render for specific purposes. For example, a
      # synthesis processor will usually render punctuations through
      # appropriate changes in prosody. Setting a higher level of detail may be
      # used to speak punctuations explicitly, e.g. for reading out coded part
      # numbers or pieces of software code.
      #
      # The detail attribute can be used for all interpret-as types.
      #
      # If the detail attribute is not specified, the level of detail that is
      # produced by the synthesis processor depends on the text content and the
      # language.
      #
      # When the value for the detail attribute is unknown or unsupported by a
      # processor, it must render the contained text as if no value were
      # specified for the detail attribute.
      #
      # @return [String]
      #
      def detail
        read_attr :detail
      end

      ##
      # @param [String] detail
      #
      def detail=(detail)
        write_attr :detail, detail
      end

      ##
      # Append a child; only objects whose class is exactly String are accepted.
      #
      # @raise [InvalidChildError] if arg's class is not in VALID_CHILD_TYPES
      #
      def <<(arg)
        raise InvalidChildError, "A SayAs can only accept Strings as children" unless VALID_CHILD_TYPES.include? arg.class
        super
      end

      ##
      # Whether the given class may be appended as a child. Defined here, as in
      # Speak and Voice, because Element#method_missing calls it when resolving
      # DSL child calls; without it those calls cannot succeed on a SayAs.
      #
      # @param [Class] type the candidate child class
      #
      # @return [Boolean]
      #
      def valid_child_type?(type)
        VALID_CHILD_TYPES.include? type
      end

      ##
      # Equality additionally compares interpret_as, format and detail.
      #
      def eql?(o)
        super o, :interpret_as, :format, :detail
      end
    end # SayAs
  end # SSML
end # RubySpeech
@@ -0,0 +1,57 @@
module RubySpeech
  module SSML
    ##
    # Root element of an SSML document. The Speech Synthesis Markup Language is
    # an XML application whose root element is speak.
    #
    # http://www.w3.org/TR/speech-synthesis/#S3.1.1
    #
    class Speak < Element
      include XML::Language

      VALID_CHILD_TYPES = [String, Break, Emphasis, Prosody, SayAs, Voice].freeze

      ##
      # Create a new SSML speak root element.
      #
      # The version attribute and namespace are always set; the language falls
      # back to "en-US" when nothing else has set it. The caller's block, if
      # any, is then evaluated in the context of the new element.
      #
      # @param [Hash] atts Key-value pairs of options mapping to setter methods
      #
      # @return [Speak] an element for use in an SSML document
      #
      def self.new(atts = {}, &block)
        super('speak', atts) do
          self[:version] = '1.0'
          self.namespace = 'http://www.w3.org/2001/10/synthesis'
          self.language ||= "en-US"
          instance_eval(&block) if block
        end
      end

      ##
      # NOTE(review): this reads :base while the writer stores 'xml:base';
      # presumably the underlying node resolves both to the same attribute -
      # confirm.
      #
      # @return [String] the base URI to which relative URLs are resolved
      #
      def base_uri
        read_attr(:base)
      end

      ##
      # @param [String] uri the base URI to which relative URLs are resolved
      #
      def base_uri=(uri)
        write_attr('xml:base', uri)
      end

      ##
      # Append a child, rejecting anything whose class is not listed in
      # VALID_CHILD_TYPES.
      #
      # @raise [InvalidChildError] if arg's class is not in VALID_CHILD_TYPES
      #
      def <<(arg)
        return super if VALID_CHILD_TYPES.include?(arg.class)
        raise InvalidChildError, "A Speak can only accept String, Audio, Break, Emphasis, Mark, P, Phoneme, Prosody, SayAs, Sub, S, Voice as children"
      end

      ##
      # @param [Class] type the candidate child class
      #
      # @return [Boolean] whether type is listed in VALID_CHILD_TYPES
      #
      def valid_child_type?(type)
        VALID_CHILD_TYPES.include?(type)
      end

      ##
      # Equality additionally compares language and base_uri.
      #
      def eql?(o)
        super(o, :language, :base_uri)
      end
    end # Speak
  end # SSML
end # RubySpeech
@@ -0,0 +1,125 @@
module RubySpeech
  module SSML
    ##
    # The voice element is a production element that requests a change in
    # speaking voice.
    #
    # http://www.w3.org/TR/speech-synthesis/#S3.2.1
    #
    class Voice < Element
      include XML::Language

      VALID_GENDERS = [:male, :female, :neutral].freeze
      VALID_CHILD_TYPES = [String, Break, Emphasis, Prosody, SayAs, Voice].freeze

      ##
      # Create a new SSML voice element
      #
      # @param [Hash] atts Key-value pairs of options mapping to setter methods
      #
      # @return [Voice] an element for use in an SSML document
      #
      def self.new(atts = {}, &block)
        super('voice', atts, &block)
      end

      ##
      # Indicates the preferred gender of the voice to speak the contained
      # text. Enumerated values are: "male", "female", "neutral".
      #
      # @return [Symbol]
      #
      def gender
        read_attr(:gender, :to_sym)
      end

      ##
      # @param [Symbol] g the gender selected from VALID_GENDERS
      #
      # @raise [ArgumentError] if g is not one of VALID_GENDERS
      #
      def gender=(g)
        unless VALID_GENDERS.include?(g)
          raise ArgumentError, "You must specify a valid gender (#{VALID_GENDERS.map(&:inspect).join ', '})"
        end
        write_attr(:gender, g)
      end

      ##
      # Indicates the preferred age in years (since birth) of the voice to
      # speak the contained text.
      #
      # @return [Integer]
      #
      def age
        read_attr(:age, :to_i)
      end

      ##
      # @param [Integer] i the age of the voice
      #
      # @raise [ArgumentError] if i is not a non-negative integer
      #
      def age=(i)
        if !i.is_a?(Integer) || i < 0
          raise ArgumentError, "You must specify a valid age (non-negative integer)"
        end
        write_attr(:age, i)
      end

      ##
      # Indicates a preferred variant of the other voice characteristics to
      # speak the contained text. (e.g. the second male child voice).
      #
      # @return [Integer]
      #
      def variant
        read_attr(:variant, :to_i)
      end

      ##
      # @param [Integer] i the variant of the voice
      #
      # @raise [ArgumentError] if i is not a positive integer (zero is rejected)
      #
      def variant=(i)
        if !i.is_a?(Integer) || i <= 0
          raise ArgumentError, "You must specify a valid variant (positive integer)"
        end
        write_attr(:variant, i)
      end

      ##
      # A processor-specific voice name to speak the contained text.
      #
      # The stored attribute is split on spaces: no names yields nil, a single
      # name yields that String, several names yield an Array.
      #
      # @return [String, Array, nil] the name or names of the voice
      #
      def name
        raw = read_attr(:name)
        return unless raw
        parts = raw.split(' ')
        return if parts.empty?
        parts.size == 1 ? parts.first : parts
      end

      ##
      # @param [String, Array] n the name or names of the voice. May be an
      #   array of names ordered from top preference down; an array is stored
      #   as a single space-separated string. The names must not contain any
      #   white space.
      #
      def name=(n)
        # TODO: Raise ArgumentError if names contain whitespace
        write_attr(:name, n.is_a?(Array) ? n.join(' ') : n)
      end

      ##
      # @return [Array] the classes accepted as children (VALID_CHILD_TYPES)
      #
      def valid_child_types
        VALID_CHILD_TYPES
      end

      ##
      # Append a child, rejecting anything whose class is not listed in
      # VALID_CHILD_TYPES.
      #
      # @raise [InvalidChildError] if arg's class is not in VALID_CHILD_TYPES
      #
      def <<(arg)
        return super if VALID_CHILD_TYPES.include?(arg.class)
        raise InvalidChildError, "A Voice can only accept String, Audio, Break, Emphasis, Mark, P, Phoneme, Prosody, SayAs, Sub, S, Voice as children"
      end

      ##
      # @param [Class] type the candidate child class
      #
      # @return [Boolean] whether type is listed in VALID_CHILD_TYPES
      #
      def valid_child_type?(type)
        VALID_CHILD_TYPES.include?(type)
      end

      ##
      # Equality additionally compares language, gender, age, variant and name.
      #
      def eql?(o)
        super(o, :language, :gender, :age, :variant, :name)
      end
    end # Voice
  end # SSML
end # RubySpeech