ruby_speech 2.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +12 -0
- data/.rspec +3 -0
- data/.travis.yml +17 -0
- data/CHANGELOG.md +144 -0
- data/Gemfile +3 -0
- data/Guardfile +9 -0
- data/LICENSE.md +20 -0
- data/README.md +314 -0
- data/Rakefile +34 -0
- data/assets/grammar-core.xsd +317 -0
- data/assets/grammar.xsd +37 -0
- data/assets/synthesis-core.xsd +445 -0
- data/assets/synthesis.xsd +63 -0
- data/assets/xml.xsd +287 -0
- data/ext/ruby_speech/RubySpeechGRXMLMatcher.java +64 -0
- data/ext/ruby_speech/RubySpeechService.java +23 -0
- data/ext/ruby_speech/extconf.rb +7 -0
- data/ext/ruby_speech/ruby_speech.c +97 -0
- data/lib/ruby_speech/generic_element.rb +169 -0
- data/lib/ruby_speech/grxml/element.rb +29 -0
- data/lib/ruby_speech/grxml/grammar.rb +189 -0
- data/lib/ruby_speech/grxml/item.rb +144 -0
- data/lib/ruby_speech/grxml/match.rb +16 -0
- data/lib/ruby_speech/grxml/matcher.rb +126 -0
- data/lib/ruby_speech/grxml/max_match.rb +6 -0
- data/lib/ruby_speech/grxml/no_match.rb +10 -0
- data/lib/ruby_speech/grxml/one_of.rb +31 -0
- data/lib/ruby_speech/grxml/potential_match.rb +10 -0
- data/lib/ruby_speech/grxml/rule.rb +73 -0
- data/lib/ruby_speech/grxml/ruleref.rb +69 -0
- data/lib/ruby_speech/grxml/tag.rb +29 -0
- data/lib/ruby_speech/grxml/token.rb +31 -0
- data/lib/ruby_speech/grxml.rb +39 -0
- data/lib/ruby_speech/nlsml/builder.rb +34 -0
- data/lib/ruby_speech/nlsml/document.rb +120 -0
- data/lib/ruby_speech/nlsml.rb +18 -0
- data/lib/ruby_speech/ruby_speech.jar +0 -0
- data/lib/ruby_speech/ssml/audio.rb +47 -0
- data/lib/ruby_speech/ssml/break.rb +62 -0
- data/lib/ruby_speech/ssml/desc.rb +24 -0
- data/lib/ruby_speech/ssml/element.rb +23 -0
- data/lib/ruby_speech/ssml/emphasis.rb +44 -0
- data/lib/ruby_speech/ssml/mark.rb +43 -0
- data/lib/ruby_speech/ssml/p.rb +25 -0
- data/lib/ruby_speech/ssml/phoneme.rb +72 -0
- data/lib/ruby_speech/ssml/prosody.rb +172 -0
- data/lib/ruby_speech/ssml/s.rb +25 -0
- data/lib/ruby_speech/ssml/say_as.rb +100 -0
- data/lib/ruby_speech/ssml/speak.rb +27 -0
- data/lib/ruby_speech/ssml/sub.rb +42 -0
- data/lib/ruby_speech/ssml/voice.rb +108 -0
- data/lib/ruby_speech/ssml.rb +39 -0
- data/lib/ruby_speech/version.rb +3 -0
- data/lib/ruby_speech/xml/language.rb +13 -0
- data/lib/ruby_speech/xml.rb +11 -0
- data/lib/ruby_speech.rb +36 -0
- data/ruby_speech.gemspec +42 -0
- data/spec/ruby_speech/grxml/grammar_spec.rb +341 -0
- data/spec/ruby_speech/grxml/item_spec.rb +192 -0
- data/spec/ruby_speech/grxml/match_spec.rb +15 -0
- data/spec/ruby_speech/grxml/matcher_spec.rb +688 -0
- data/spec/ruby_speech/grxml/max_match_spec.rb +17 -0
- data/spec/ruby_speech/grxml/no_match_spec.rb +17 -0
- data/spec/ruby_speech/grxml/one_of_spec.rb +49 -0
- data/spec/ruby_speech/grxml/potential_match_spec.rb +17 -0
- data/spec/ruby_speech/grxml/rule_spec.rb +125 -0
- data/spec/ruby_speech/grxml/ruleref_spec.rb +55 -0
- data/spec/ruby_speech/grxml/tag_spec.rb +41 -0
- data/spec/ruby_speech/grxml/token_spec.rb +62 -0
- data/spec/ruby_speech/grxml_spec.rb +339 -0
- data/spec/ruby_speech/nlsml_spec.rb +353 -0
- data/spec/ruby_speech/ssml/audio_spec.rb +121 -0
- data/spec/ruby_speech/ssml/break_spec.rb +100 -0
- data/spec/ruby_speech/ssml/desc_spec.rb +57 -0
- data/spec/ruby_speech/ssml/emphasis_spec.rb +110 -0
- data/spec/ruby_speech/ssml/mark_spec.rb +53 -0
- data/spec/ruby_speech/ssml/p_spec.rb +96 -0
- data/spec/ruby_speech/ssml/phoneme_spec.rb +65 -0
- data/spec/ruby_speech/ssml/prosody_spec.rb +309 -0
- data/spec/ruby_speech/ssml/s_spec.rb +92 -0
- data/spec/ruby_speech/ssml/say_as_spec.rb +71 -0
- data/spec/ruby_speech/ssml/speak_spec.rb +166 -0
- data/spec/ruby_speech/ssml/sub_spec.rb +57 -0
- data/spec/ruby_speech/ssml/voice_spec.rb +200 -0
- data/spec/ruby_speech/ssml_spec.rb +285 -0
- data/spec/ruby_speech_spec.rb +124 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/support/match_examples.rb +43 -0
- data/spec/support/matchers.rb +46 -0
- metadata +405 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
module RubySpeech
  module GRXML
    ##
    # GRXML <tag> element.
    #
    # A tag is one of the valid expansion elements of an SRGS rule element.
    #
    # http://www.w3.org/TR/speech-grammar/#S2.6
    #
    # TODO: Make sure this is complete...
    #
    class Tag < Element

      register :tag

      # Exact classes that #<< accepts as children.
      VALID_CHILD_TYPES = [Nokogiri::XML::Element, Nokogiri::XML::Text, String].freeze

      ##
      # Appends a child once its class has been checked against VALID_CHILD_TYPES.
      #
      # @param arg the child to append
      #
      # @raise [InvalidChildError] if the class of arg is not listed in VALID_CHILD_TYPES
      #
      def <<(arg)
        unless VALID_CHILD_TYPES.include?(arg.class)
          raise InvalidChildError, "A Tag can only accept Strings as children"
        end

        super
      end

      # Wraps the tag's content in "?<" and ">".
      def regexp_content # :nodoc:
        label = content
        "?<#{label}>"
      end
    end # Tag
  end # GRXML
end # RubySpeech
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
module RubySpeech
  module GRXML
    ##
    # GRXML <token> element.
    #
    # A token (a.k.a. a terminal symbol) is the part of a grammar that defines
    # words or other entities that may be spoken. Any legal token is a legal
    # expansion.
    #
    # http://www.w3.org/TR/speech-grammar/#S2.1
    #
    # The token element may include an optional xml:lang attribute to indicate
    # the language of the contained token.
    #
    class Token < Element

      register :token

      # Exact classes that #<< accepts as children.
      VALID_CHILD_TYPES = [Nokogiri::XML::Element, Nokogiri::XML::Text, String].freeze

      ##
      # Appends a child once its class has been checked against VALID_CHILD_TYPES.
      #
      # @param arg the child to append
      #
      # @raise [InvalidChildError] if the class of arg is not listed in VALID_CHILD_TYPES
      #
      def <<(arg)
        unless VALID_CHILD_TYPES.include?(arg.class)
          raise InvalidChildError, "A Token can only accept Strings as children"
        end

        super
      end

      ##
      # Rewrites the content with surrounding whitespace stripped and runs of
      # spaces collapsed to a single space.
      #
      def normalize_whitespace
        trimmed = content.strip
        self.content = trimmed.squeeze(' ')
      end

      # The content with regular-expression metacharacters escaped.
      def regexp_content # :nodoc:
        Regexp.escape(content)
      end
    end # Token
  end # GRXML
end # RubySpeech
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
module RubySpeech
  ##
  # Namespace for the GRXML element classes and match helpers, together with
  # the entry points for drawing and importing grammars.
  #
  module GRXML
    extend ActiveSupport::Autoload

    # Element classes, registered for eager autoloading in this order.
    eager_autoload do
      [:Element, :Grammar, :Rule, :Item, :OneOf, :Ruleref, :Tag, :Token].each do |const_name|
        autoload const_name
      end
    end

    # Match-related classes, autoloaded outside the eager block.
    [:Match, :Matcher, :MaxMatch, :NoMatch, :PotentialMatch].each do |const_name|
      autoload const_name
    end

    # Raised by element classes when an unsupported child is appended.
    InvalidChildError = Class.new StandardError

    GRXML_NAMESPACE = 'http://www.w3.org/2001/06/grammar'

    ##
    # Builds a Grammar from the given attributes and DSL block.
    #
    # If the block evaluates to a String, that string is appended to the grammar.
    #
    # @param [Hash] attributes passed through to Grammar.new
    # @yield DSL block evaluated by Grammar#eval_dsl_block
    #
    # @return whatever Grammar#assert_has_matching_root_rule returns
    #
    def self.draw(attributes = {}, &block)
      grammar = Grammar.new(attributes)
      block_return = grammar.eval_dsl_block(&block)
      grammar << block_return if block_return.is_a?(String)
      grammar.assert_has_matching_root_rule
    end

    ##
    # Imports an existing document or node by delegating to Element.import.
    #
    # @param other the object to import
    #
    def self.import(other)
      Element.import(other)
    end
  end # GRXML
end # RubySpeech

# NOTE(review): presumably loads everything registered via eager_autoload above;
# confirm against the ActiveSupport version in use.
ActiveSupport::Autoload.eager_autoload!
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
module RubySpeech
  module NLSML
    ##
    # Small DSL wrapper around Nokogiri::XML::Builder for drawing NLSML result
    # documents. The block given to #initialize is evaluated against the Builder
    # instance; unknown methods are forwarded to the underlying <result> builder.
    #
    class Builder
      # The document produced by the Nokogiri builder.
      attr_reader :document

      ##
      # @param [Hash] options attributes for the root <result> element; an
      #   'xmlns' entry defaults to NLSML_NAMESPACE and may be overridden
      # @yield DSL block evaluated in the context of this Builder
      #
      def initialize(options = {}, &block)
        options = {'xmlns' => NLSML_NAMESPACE}.merge(options)
        @document = Nokogiri::XML::Builder.new do |builder|
          builder.result options do |r|
            apply_block r, &block
          end
        end.doc
      end

      ##
      # Emits an <interpretation> element. When the last argument is hash-like
      # and carries a :confidence entry, that entry is coerced with #to_f.
      #
      # The coercion is applied to a copy, so the caller's hash is never
      # modified (and frozen hashes are accepted).
      #
      def interpretation(*args, &block)
        attributes = args.last
        if attributes.respond_to?(:has_key?) && attributes.has_key?(:confidence)
          coerced = attributes.dup
          coerced[:confidence] = attributes[:confidence].to_f
          args[-1] = coerced
        end
        @result.send :interpretation, *args, &block
      end

      # Forwards any other DSL call to the underlying result builder.
      def method_missing(method_name, *args, &block)
        @result.send method_name, *args, &block
      end

      private

      ##
      # Stores the result builder and evaluates the DSL block against self.
      # Without a block there is nothing to evaluate, so the result stays empty
      # rather than failing inside instance_eval.
      #
      def apply_block(result, &block)
        @result = result
        instance_eval(&block) if block
      end
    end
  end
end
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
require 'delegate'
|
|
2
|
+
|
|
3
|
+
module RubySpeech
  module NLSML
    ##
    # Wraps a parsed NLSML result document (delegating everything else to it)
    # and exposes its interpretations as plain Ruby hashes.
    #
    class Document < SimpleDelegator
      ##
      # @param xml a parsed XML document responding to #root and #to_xml
      #
      def initialize(xml)
        unless xml.root.namespace
          # Documents without a namespace get the NLSML default and are
          # re-parsed. NOTE(review): presumably so that the namespace applies
          # to every descendant - confirm.
          xml.root.default_namespace = NLSML_NAMESPACE
          xml = Nokogiri::XML.parse xml.to_xml, nil, nil, Nokogiri::XML::ParseOptions::NOBLANKS
        end
        super(xml)
      end

      # @return the value of the root element's 'grammar' attribute
      def grammar
        result['grammar']
      end

      ##
      # @return [Array<Hash>] one hash per interpretation, highest confidence first
      #
      def interpretations
        interpretation_nodes.map do |interpretation|
          interpretation_hash_for_interpretation interpretation
        end
      end

      ##
      # @return [Hash, nil] the highest-confidence interpretation, or nil when
      #   the document contains no interpretations
      #
      def best_interpretation
        best = interpretation_nodes.first
        interpretation_hash_for_interpretation best if best
      end

      ##
      # @return [Boolean] true when there is at least one interpretation and
      #   the document signals neither nomatch nor noinput
      #
      def match?
        interpretation_nodes.any? && !nomatch? && !noinput?
      end

      ##
      # Documents are equal when they serialize to the same XML. Objects that
      # cannot be serialized with #to_xml are never equal.
      #
      def ==(other)
        other.respond_to?(:to_xml) && to_xml == other.to_xml
      end

      # @return [Boolean] true when any input carries a <noinput> element
      def noinput?
        noinput_elements.any?
      end

      private

      # True when there are at least as many <nomatch> elements as <input> elements.
      def nomatch?
        nomatch_elements.count >= input_elements.count
      end

      def nomatch_elements
        result.xpath 'ns:interpretation/ns:input/ns:nomatch', 'ns' => NLSML_NAMESPACE
      end

      def noinput_elements
        result.xpath 'ns:interpretation/ns:input/ns:noinput', 'ns' => NLSML_NAMESPACE
      end

      def input_elements
        result.xpath 'ns:interpretation/ns:input', 'ns' => NLSML_NAMESPACE
      end

      # Hash with the input's :content and, when present, its :mode as a Symbol.
      def input_hash_for_interpretation(interpretation)
        input_element = interpretation.at_xpath 'ns:input', 'ns' => NLSML_NAMESPACE
        { content: input_element.content }.tap do |h|
          h[:mode] = input_element['mode'].to_sym if input_element['mode']
        end
      end

      # Key/value form of the first <instance>, or nil when there is none.
      def instance_hash_for_interpretation(interpretation)
        instances = instance_elements interpretation
        return unless instances.any?
        element_children_key_value instances.first
      end

      # Key/value form of every <instance> of the interpretation.
      def instances_collection_for_interpretation(interpretation)
        instance_elements(interpretation).map do |instance|
          element_children_key_value instance
        end
      end

      def instance_elements(interpretation)
        interpretation.xpath 'ns:instance', 'ns' => NLSML_NAMESPACE
      end

      ##
      # Converts an element into a String (when its first child is a text
      # node) or into a Hash keyed by child node name.
      #
      def element_children_key_value(element)
        first_child = element.children.first
        return first_child.content if first_child.is_a?(Nokogiri::XML::Text)

        element.children.each_with_object({}) do |child, acc|
          # A childless node contributes its own content. Anything else is
          # converted recursively; the recursion already returns the text of
          # a leading text node, so a lone text child needs no special case.
          acc[child.node_name.to_sym] =
            if child.children.count.zero?
              child.content
            else
              element_children_key_value child
            end
        end
      end

      def interpretation_hash_for_interpretation(interpretation)
        {
          confidence: interpretation['confidence'].to_f,
          input: input_hash_for_interpretation(interpretation),
          instance: instance_hash_for_interpretation(interpretation),
          instances: instances_collection_for_interpretation(interpretation)
        }
      end

      # The root <result> element of the wrapped document.
      def result
        root
      end

      ##
      # Interpretation nodes ordered by descending confidence. sort_by is not
      # stable, so the document position is used as a tie-breaker to keep
      # equal-confidence interpretations in document order.
      #
      def interpretation_nodes
        nodes = result.xpath 'ns:interpretation', 'ns' => NLSML_NAMESPACE
        nodes.each_with_index
             .sort_by { |node, position| [-node[:confidence].to_f, position] }
             .map(&:first)
      end
    end
  end
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
module RubySpeech
  ##
  # Namespace for building (Builder) and reading (Document) NLSML results.
  #
  module NLSML
    extend ActiveSupport::Autoload

    NLSML_NAMESPACE = 'http://www.ietf.org/xml/ns/mrcpv2'

    # Both classes are registered for eager autoloading.
    eager_autoload do
      [:Builder, :Document].each { |const_name| autoload const_name }
    end

    ##
    # Draws an NLSML document using the Builder DSL.
    #
    # @param [Hash] options passed through to Builder.new
    # @yield DSL block passed through to Builder.new
    #
    # @return the document produced by the builder
    #
    def self.draw(options = {}, &block)
      builder = Builder.new(options, &block)
      builder.document
    end
  end
end

# NOTE(review): presumably loads everything registered via eager_autoload above;
# confirm against the ActiveSupport version in use.
ActiveSupport::Autoload.eager_autoload!
|
|
Binary file
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
module RubySpeech
  module SSML
    ##
    # SSML <audio> element.
    #
    # The audio element supports the insertion of recorded audio files (see
    # Appendix A for required formats) and the insertion of other audio formats
    # in conjunction with synthesized speech output. The audio element may be
    # empty. If it is not empty, the contents should be the marked-up text to be
    # spoken if the audio document is not available. The alternate content may
    # include text, speech markup, desc elements, or other audio elements, and
    # may also be used when rendering the document to non-audible output and for
    # accessibility (see the desc element). The required attribute is src, which
    # is the URI of a document with an appropriate MIME type.
    #
    # An audio element is successfully rendered:
    #   * If the referenced audio source is played, or
    #   * If the synthesis processor is unable to execute #1 but the alternative
    #     content is successfully rendered, or
    #   * If the processor can detect that text-only output is required and the
    #     alternative content is successfully rendered.
    #
    # Deciding which conditions result in the alternative content being rendered
    # is processor-dependent. If the audio element is not successfully rendered,
    # a synthesis processor should continue processing and should notify the
    # hosting environment. The processor may determine after beginning playback
    # of an audio source that the audio cannot be played in its entirety (for
    # example because of encoding problems or network disruptions). The
    # processor may designate this either as successful or unsuccessful
    # rendering, but it must document this behavior.
    #
    # http://www.w3.org/TR/speech-synthesis/#S3.3.1
    #
    class Audio < Element

      register :audio

      # Exact classes that #<< accepts as children.
      VALID_CHILD_TYPES = [Nokogiri::XML::Element, Nokogiri::XML::Text, String, Audio, Break, Desc, Emphasis, Mark, P, Phoneme, Prosody, S, SayAs, Sub, Voice].freeze

      ##
      # The URI of a document with an appropriate MIME type
      #
      # @return [String]
      #
      def src
        read_attr(:src)
      end

      ##
      # @param [String] s the source. Must be a valid URI
      #
      def src=(s)
        self[:src] = s
      end

      ##
      # Appends a child once its class has been checked against VALID_CHILD_TYPES.
      #
      # NOTE(review): Desc is an accepted child type but is not mentioned in the
      # error message below.
      #
      # @raise [InvalidChildError] if the class of arg is not listed in VALID_CHILD_TYPES
      #
      def <<(arg)
        unless VALID_CHILD_TYPES.include?(arg.class)
          raise InvalidChildError, "An Audio can only accept String, Audio, Break, Emphasis, Mark, P, Phoneme, Prosody, SayAs, Sub, S, Voice as children"
        end

        super
      end

      # Delegates to the inherited eql?, naming :src as the attribute to compare.
      def eql?(o)
        super(o, :src)
      end
    end # Audio
  end # SSML
end # RubySpeech
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
module RubySpeech
  module SSML
    ##
    # SSML <break> element.
    #
    # The break element is an empty element that controls the pausing or other
    # prosodic boundaries between words. The use of the break element between
    # any pair of words is optional. If the element is not present between
    # words, the synthesis processor is expected to automatically determine a
    # break based on the linguistic context. In practice, the break element is
    # most often used to override the typical automatic behavior of a synthesis
    # processor.
    #
    # http://www.w3.org/TR/speech-synthesis/#S3.2.3
    #
    class Break < Element

      register :break

      VALID_STRENGTHS = [:none, :'x-weak', :weak, :medium, :strong, :'x-strong'].freeze

      ##
      # This attribute is used to indicate the strength of the prosodic break in
      # the speech output. The value "none" indicates that no prosodic break
      # boundary should be outputted, which can be used to prevent a prosodic
      # break which the processor would otherwise produce. The other values
      # indicate monotonically non-decreasing (conceptually increasing) break
      # strength between words. The stronger boundaries are typically
      # accompanied by pauses. "x-weak" and "x-strong" are mnemonics for "extra
      # weak" and "extra strong", respectively.
      #
      # @return [Symbol]
      #
      def strength
        read_attr :strength, :to_sym
      end

      ##
      # @param [Symbol] s the strength. Must be one of VALID_STRENGTHS
      #
      # @raise [ArgumentError] if s is not one of VALID_STRENGTHS
      #
      def strength=(s)
        raise ArgumentError, "You must specify a valid strength (#{VALID_STRENGTHS.map(&:inspect).join ', '})" unless VALID_STRENGTHS.include? s
        self[:strength] = s
      end

      ##
      # Indicates the duration of a pause to be inserted in the output in
      # seconds or milliseconds. It follows the time value format from the
      # Cascading Style Sheets Level 2 Recommendation [CSS2], e.g. "250ms", "3s".
      #
      # NOTE(review): the reader is invoked with :to_f; how a unit suffix such as
      # "ms" is treated depends on read_attr - confirm.
      #
      # @return [Float]
      #
      def time
        read_attr :time, :to_f
      end

      ##
      # Stores the time as "<t>s".
      #
      # @param [Numeric] t the time as a non-negative value in seconds
      #
      # @raise [ArgumentError] if t is not a non-negative numeric value
      #
      def time=(t)
        raise ArgumentError, "You must specify a valid time (positive float value in seconds)" unless t.is_a?(Numeric) && t >= 0
        self[:time] = "#{t}s"
      end

      ##
      # A break is an empty element, so appending always fails.
      #
      # @raise [InvalidChildError] always
      #
      def <<(*args)
        raise InvalidChildError, "A Break cannot contain children"
      end

      # Delegates to the inherited eql?, naming :strength and :time as the
      # attributes to compare.
      def eql?(o)
        super o, :strength, :time
      end
    end # Break
  end # SSML
end # RubySpeech
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
module RubySpeech
  module SSML
    ##
    # SSML <desc> element.
    #
    # The desc element holds a textual description of a non-speech audio source
    # referenced by an audio element (e.g. "door slamming"). It accepts only
    # text content.
    #
    # NOTE(review): the section reference below is taken from the SSML 1.0
    # recommendation, where desc is described after audio and mark - confirm.
    #
    # http://www.w3.org/TR/speech-synthesis/#S3.3.3
    #
    class Desc < Element

      register :desc

      # Exact classes that #<< accepts as children.
      VALID_CHILD_TYPES = [Nokogiri::XML::Text, String].freeze

      ##
      # Appends a child once its class has been checked against VALID_CHILD_TYPES.
      #
      # @raise [InvalidChildError] if the class of arg is not listed in VALID_CHILD_TYPES
      #
      def <<(arg)
        unless VALID_CHILD_TYPES.include?(arg.class)
          raise InvalidChildError, "A Desc can only accept Strings as children"
        end

        super
      end

      # Delegates to the inherited eql?, naming :language as the attribute to compare.
      def eql?(o)
        super(o, :language)
      end
    end # Desc
  end # SSML
end # RubySpeech
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
require 'active_support/core_ext/class/attribute'
|
|
2
|
+
|
|
3
|
+
module RubySpeech
  module SSML
    ##
    # Base class of all SSML elements. Supplies the SSML-specific answers
    # (namespace, root element class, owning module) and mixes in the shared
    # GenericElement behaviour.
    #
    class Element < Niceogiri::XML::Node
      class << self
        # @return [String] the SSML namespace constant
        def namespace
          SSML_NAMESPACE
        end

        # @return [Class] the class used as document root (Speak)
        def root_element
          Speak
        end

        # @return [Module] the module that owns the SSML element classes
        def module
          SSML
        end
      end

      include GenericElement

      # #to_doc is another name for #document.
      alias_method :to_doc, :document
    end # Element
  end # SSML
end # RubySpeech
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
module RubySpeech
  module SSML
    ##
    # SSML <emphasis> element.
    #
    # The emphasis element requests that the contained text be spoken with
    # emphasis (also referred to as prominence or stress). The synthesis
    # processor determines how to render emphasis since the nature of emphasis
    # differs between languages, dialects or even voices.
    #
    # http://www.w3.org/TR/speech-synthesis/#S3.2.2
    #
    class Emphasis < Element

      register :emphasis

      VALID_LEVELS = [:strong, :moderate, :none, :reduced].freeze

      # Exact classes that #<< accepts as children.
      VALID_CHILD_TYPES = [Nokogiri::XML::Element, Nokogiri::XML::Text, String, Audio, Break, Emphasis, Mark, Phoneme, Prosody, SayAs, Sub, Voice].freeze

      ##
      # Indicates the strength of emphasis to be applied. Defined values are
      # "strong", "moderate", "none" and "reduced". The default level is
      # "moderate". The meaning of "strong" and "moderate" emphasis is
      # interpreted according to the language being spoken (languages indicate
      # emphasis using a possible combination of pitch change, timing changes,
      # loudness and other acoustic differences). The "reduced" level is
      # effectively the opposite of emphasizing a word. For example, when the
      # phrase "going to" is reduced it may be spoken as "gonna". The "none"
      # level is used to prevent the synthesis processor from emphasizing words
      # that it might typically emphasize. The values "none", "moderate", and
      # "strong" are monotonically non-decreasing in strength.
      #
      # @return [Symbol]
      #
      def level
        read_attr(:level, :to_sym)
      end

      ##
      # @param [Symbol] l the level. Must be one of VALID_LEVELS
      #
      # @raise [ArgumentError] if l is not one of VALID_LEVELS
      #
      def level=(l)
        unless VALID_LEVELS.include?(l)
          raise ArgumentError, "You must specify a valid level (#{VALID_LEVELS.map(&:inspect).join ', '})"
        end

        self[:level] = l
      end

      ##
      # Appends a child once its class has been checked against VALID_CHILD_TYPES.
      #
      # @raise [InvalidChildError] if the class of arg is not listed in VALID_CHILD_TYPES
      #
      def <<(arg)
        unless VALID_CHILD_TYPES.include?(arg.class)
          raise InvalidChildError, "An Emphasis can only accept String, Audio, Break, Emphasis, Mark, Phoneme, Prosody, SayAs, Sub, Voice as children"
        end

        super
      end

      # Delegates to the inherited eql?, naming :level as the attribute to compare.
      def eql?(o)
        super(o, :level)
      end
    end # Emphasis
  end # SSML
end # RubySpeech
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
module RubySpeech
  module SSML
    ##
    # SSML <mark> element.
    #
    # A mark element is an empty element that places a marker into the text/tag
    # sequence. It has one required attribute, name, which is of type xsd:token
    # [SCHEMA2 §3.3.2]. The mark element can be used to reference a specific
    # location in the text/tag sequence, and can additionally be used to insert
    # a marker into an output stream for asynchronous notification. When
    # processing a mark element, a synthesis processor must do one or both of
    # the following:
    #
    #   * inform the hosting environment with the value of the name attribute
    #     and with information allowing the platform to retrieve the
    #     corresponding position in the rendered output.
    #   * when audio output of the SSML document reaches the mark, issue an
    #     event that includes the required name attribute of the element. The
    #     hosting environment defines the destination of the event.
    #
    # The mark element does not affect the speech output process.
    #
    # http://www.w3.org/TR/speech-synthesis/#S3.3.2
    #
    class Mark < Element

      register :mark

      ##
      # This attribute is a token by which to reference the mark
      #
      # @return [String]
      #
      def name
        read_attr :name
      end

      ##
      # @param [String] other the name token
      #
      def name=(other)
        self[:name] = other
      end

      ##
      # A mark is an empty element, so appending always fails.
      #
      # @raise [InvalidChildError] always
      #
      def <<(*args)
        raise InvalidChildError, "A Mark cannot contain children"
      end

      # Delegates to the inherited eql?, naming :name as the attribute to compare.
      def eql?(o)
        super o, :name
      end
    end # Mark
  end # SSML
end # RubySpeech
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
module RubySpeech
  module SSML
    ##
    # SSML <p> element.
    #
    # A p element represents a paragraph. The use of p elements is optional.
    # Where text occurs without an enclosing p element the synthesis processor
    # should attempt to determine the structure using language-specific
    # knowledge of the format of plain text.
    #
    # http://www.w3.org/TR/speech-synthesis/#S3.1.7
    #
    class P < Element

      register :p

      # Exact classes that #<< accepts as children.
      VALID_CHILD_TYPES = [Nokogiri::XML::Element, Nokogiri::XML::Text, String, Audio, Break, Emphasis, Mark, Phoneme, Prosody, S, SayAs, Sub, Voice].freeze

      ##
      # Appends a child once its class has been checked against VALID_CHILD_TYPES.
      #
      # @raise [InvalidChildError] if the class of arg is not listed in VALID_CHILD_TYPES
      #
      def <<(arg)
        unless VALID_CHILD_TYPES.include?(arg.class)
          raise InvalidChildError, "A P can only accept String, Audio, Break, Emphasis, Mark, Phoneme, Prosody, SayAs, Sub, S, Voice as children"
        end

        super
      end

      # Delegates to the inherited eql?, naming :language as the attribute to compare.
      def eql?(o)
        super(o, :language)
      end
    end # P
  end # SSML
end # RubySpeech
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
module RubySpeech
  module SSML
    ##
    # SSML <phoneme> element.
    #
    # The phoneme element provides a phonemic/phonetic pronunciation for the
    # contained text. The phoneme element may be empty. However, it is
    # recommended that the element contain human-readable text that can be used
    # for non-spoken rendering of the document. For example, the content may be
    # displayed visually for users with hearing impairments.
    #
    # The ph attribute is a required attribute that specifies the phoneme/phone
    # string.
    #
    # This element is designed strictly for phonemic and phonetic notations and
    # is intended to be used to provide pronunciations for words or very short
    # phrases. The phonemic/phonetic string does not undergo text normalization
    # and is not treated as a token for lookup in the lexicon (see Section
    # 3.1.4), while values in say-as and sub may undergo both. Briefly, phonemic
    # strings consist of phonemes, language-dependent speech units that
    # characterize linguistically significant differences in the language;
    # loosely, phonemes represent all the sounds needed to distinguish one word
    # from another in a given language. On the other hand, phonetic strings
    # consist of phones, speech units that characterize the manner (puff of air,
    # click, vocalized, etc.) and place (front, middle, back, etc.) of
    # articulation within the human vocal tract and are thus independent of
    # language; phones represent realized distinctions in human speech
    # production.
    #
    # The alphabet attribute is an optional attribute that specifies the
    # phonemic/phonetic alphabet. An alphabet in this context refers to a
    # collection of symbols to represent the sounds of one or more human
    # languages. The only valid values for this attribute are "ipa" (see the
    # next paragraph) and vendor-defined strings of the form "x-organization" or
    # "x-organization-alphabet". For example, the Japan Electronics and
    # Information Technology Industries Association [JEITA] might wish to
    # encourage the use of an alphabet such as "x-JEITA" or "x-JEITA-2000" for
    # their phoneme alphabet [JEIDAALPHABET].
    #
    # Synthesis processors should support a value for alphabet of "ipa",
    # corresponding to Unicode representations of the phonetic characters
    # developed by the International Phonetic Association [IPA]. In addition to
    # an exhaustive set of vowel and consonant symbols, this character set
    # supports a syllable delimiter, numerous diacritics, stress symbols,
    # lexical tone symbols, intonational markers and more. For this alphabet,
    # legal ph values are strings of the values specified in Appendix 2 of
    # [IPAHNDBK]. Informative tables of the IPA-to-Unicode mappings can be found
    # at [IPAUNICODE1] and [IPAUNICODE2]. Note that not all of the IPA
    # characters are available in Unicode. For processors supporting this
    # alphabet,
    #
    #   * The processor must syntactically accept all legal ph values.
    #   * The processor should produce output when given Unicode IPA codes that
    #     can reasonably be considered to belong to the current language.
    #   * The production of output when given other codes is entirely at
    #     processor discretion.
    #
    # It is an error if a value for alphabet is specified that is not known or
    # cannot be applied by a synthesis processor. The default behavior when the
    # alphabet attribute is left unspecified is processor-specific.
    #
    # The phoneme element itself can only contain text (no elements).
    #
    # http://www.w3.org/TR/speech-synthesis/#S3.1.9
    #
    class Phoneme < Element

      register :phoneme

      # Exact classes that #<< accepts as children.
      VALID_CHILD_TYPES = [Nokogiri::XML::Text, String].freeze

      ##
      # Specifies the phonemic/phonetic alphabet
      #
      # @return [String]
      #
      def alphabet
        read_attr(:alphabet)
      end

      ##
      # @param [String] other the phonemic/phonetic alphabet
      #
      def alphabet=(other)
        self[:alphabet] = other
      end

      ##
      # Specifies the phoneme/phone string.
      #
      # @return [String]
      #
      def ph
        read_attr(:ph)
      end

      ##
      # @param [String] other the phoneme/phone string.
      #
      def ph=(other)
        self[:ph] = other
      end

      ##
      # Appends a child once its class has been checked against VALID_CHILD_TYPES.
      #
      # @raise [InvalidChildError] if the class of arg is not listed in VALID_CHILD_TYPES
      #
      def <<(arg)
        unless VALID_CHILD_TYPES.include?(arg.class)
          raise InvalidChildError, "A Phoneme can only accept Strings as children"
        end

        super
      end

      # Delegates to the inherited eql?, naming :alphabet and :ph as the
      # attributes to compare.
      def eql?(o)
        super(o, :alphabet, :ph)
      end
    end # Phoneme
  end # SSML
end # RubySpeech
|