treat 2.0.0 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +11 -0
- data/lib/treat/config/data/core.rb +3 -1
- data/lib/treat/config/data/languages/agnostic.rb +1 -1
- data/lib/treat/core/dsl.rb +12 -44
- data/lib/treat/version.rb +1 -1
- data/lib/treat/workers/extractors/name_tag/stanford.rb +1 -1
- data/lib/treat/workers/extractors/topic_words/lda.rb +1 -1
- data/lib/treat/workers/formatters/readers/autoselect.rb +3 -1
- data/lib/treat/workers/formatters/readers/html.rb +4 -2
- data/lib/treat/workers/formatters/serializers/xml.rb +1 -1
- data/lib/treat/workers/groupable.rb +1 -3
- data/lib/treat/workers/lexicalizers/categorizers/from_tag.rb +3 -2
- data/lib/treat/workers/lexicalizers/sensers/wordnet.rb +12 -2
- data/lib/treat/workers/lexicalizers/taggers/brill.rb +2 -1
- data/lib/treat/workers/lexicalizers/taggers/lingua.rb +3 -1
- data/lib/treat/workers/lexicalizers/taggers/stanford.rb +4 -5
- data/spec/entities/collection.rb +2 -2
- data/spec/entities/entity.rb +4 -4
- data/spec/helper.rb +16 -68
- data/spec/{core → learning}/data_set.rb +0 -0
- data/spec/{core → learning}/export.rb +0 -0
- data/spec/{core → learning}/problem.rb +0 -0
- data/spec/{core → learning}/question.rb +0 -0
- data/spec/sandbox.rb +14 -3
- data/spec/workers/agnostic.rb +80 -30
- data/spec/workers/english.rb +475 -190
- metadata +6 -11
- data/files/21552208.html +0 -792
- data/files/nethttp-cheat-sheet-2940.html +0 -392
- data/lib/treat/config/data/config.rb +0 -50
- data/spec/workers/language.rb +0 -280
- data/spec/workers.rb +0 -28
data/spec/workers/english.rb
CHANGED
@@ -1,194 +1,479 @@
|
|
1
|
-
|
1
|
+
require 'rspec'
|
2
2
|
|
3
|
-
|
3
|
+
require_relative '../../lib/treat'
|
4
|
+
include Treat::Core::DSL
|
5
|
+
|
6
|
+
=begin
|
7
|
+
Treat.libraries.stanford.model_path = '/ruby/stanford/stanford-core-nlp-all/'
|
8
|
+
Treat.libraries.stanford.jar_path = '/ruby/stanford/stanford-core-nlp-all/'
|
9
|
+
Treat.libraries.punkt.model_path = '/ruby/punkt/'
|
10
|
+
Treat.libraries.reuters.model_path = '/ruby/reuters/'
|
11
|
+
=end
|
12
|
+
|
13
|
+
class English
|
14
|
+
|
15
|
+
$workers = Treat.languages.english.workers
|
16
|
+
Treat.core.language.default = 'english'
|
17
|
+
Treat.core.language.detect = false
|
18
|
+
|
19
|
+
describe Treat::Workers::Processors::Segmenters do
|
20
|
+
|
21
|
+
before do
|
22
|
+
@zones = ["Qala is first referred to in a fifteenth century portolan preserved at the Vatican library has taken its name from the qala or port of Mondoq ir-Rummien. It is the easternmost village of Gozo and has been inhabited since early times. The development of the present settlement began in the second half of the seventeenth century. It is a pleasant and rural place with many natural and historic attractions.",
|
23
|
+
"Originally Radio Lehen il-Qala transmitted on frequency 106.5FM. But when consequently a national radio started transmissions on a frequency quite close, it caused a hindrance to our community radio." "People were complaining that the voice of the local radio was no longer clear and they were experiencing difficulty in following the programmes. This was a further proof of the value of the radio. It was a confirmation that it was a good and modern means of bringing the Christian message to the whole community. An official request was therefore made to the Broadcasting Authority and Radio Lehen il-Qala was given a new frequency - 106.3FM."]
|
24
|
+
@groups = [
|
25
|
+
["Qala is first referred to in a fifteenth century portolan preserved at the Vatican library has taken its name from the qala or port of Mondoq ir-Rummien.", "It is the easternmost village of Gozo and has been inhabited since early times.", "The development of the present settlement began in the second half of the seventeenth century.", "It is a pleasant and rural place with many natural and historic attractions."],
|
26
|
+
["Originally Radio Lehen il-Qala transmitted on frequency 106.5FM.", "But when consequently a national radio started transmissions on a frequency quite close, it caused a hindrance to our community radio.", "People were complaining that the voice of the local radio was no longer clear and they were experiencing difficulty in following the programmes.", "This was a further proof of the value of the radio.", "It was a confirmation that it was a good and modern means of bringing the Christian message to the whole community.", "An official request was therefore made to the Broadcasting Authority and Radio Lehen il-Qala was given a new frequency - 106.3FM."]
|
27
|
+
]
|
28
|
+
end
|
4
29
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
30
|
+
context "when #segment is called on a zone" do
|
31
|
+
it "segments the zone into groups" do
|
32
|
+
$workers.processors.segmenters.each do |segmenter|
|
33
|
+
@zones.map { |zone| zone.segment(segmenter) }
|
34
|
+
.map { |zone| zone.groups.map(&:to_s) }
|
35
|
+
.should eql @groups
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
describe Treat::Workers::Processors::Tokenizers do
|
42
|
+
|
43
|
+
before do
|
44
|
+
@groups = [
|
45
|
+
"Julius Obsequens was a Roman writer who is believed to have lived in the middle of the fourth century AD.",
|
46
|
+
"The only work associated with his name is the Liber de prodigiis (Book of Prodigies), completely extracted from an epitome, or abridgment, written by Livy; De prodigiis was constructed as an account of the wonders and portents that occurred in Rome between 249 BC-12 BC.",
|
47
|
+
"Of great importance was the edition by the Basle Humanist Conrad Lycosthenes (1552), trying to reconstruct lost parts and illustrating the text with wood-cuts.",
|
48
|
+
"These have been interpreted as reports of unidentified flying objects (UFOs), but may just as well describe meteors, and, since Obsequens, probably, writes in the 4th century, that is, some 400 years after the events he describes, they hardly qualify as eye-witness accounts.",
|
49
|
+
'"At Aenariae, while Livius Troso was promulgating the laws at the beginning of the Italian war, at sunrise, there came a terrific noise in the sky, and a globe of fire appeared burning in the north.'
|
50
|
+
]
|
51
|
+
@tokens = [
|
52
|
+
["Julius", "Obsequens", "was", "a", "Roman", "writer", "who", "is", "believed",
|
53
|
+
"to", "have", "lived", "in", "the", "middle", "of", "the", "fourth", "century", "AD", "."],
|
54
|
+
["The", "only", "work", "associated", "with", "his", "name", "is", "the", "Liber",
|
55
|
+
"de", "prodigiis", "(", "Book", "of", "Prodigies", ")", ",", "completely", "extracted",
|
56
|
+
"from", "an", "epitome", ",", "or", "abridgment", ",", "written", "by", "Livy", ";",
|
57
|
+
"De", "prodigiis", "was", "constructed", "as", "an", "account", "of", "the", "wonders",
|
58
|
+
"and", "portents", "that", "occurred", "in", "Rome", "between", "249", "BC-12", "BC", "."],
|
59
|
+
["Of", "great", "importance", "was", "the", "edition", "by", "the", "Basle", "Humanist",
|
60
|
+
"Conrad", "Lycosthenes", "(", "1552", ")", ",", "trying", "to", "reconstruct", "lost",
|
61
|
+
"parts", "and", "illustrating", "the", "text", "with", "wood-cuts", "."],
|
62
|
+
["These", "have", "been", "interpreted", "as", "reports", "of", "unidentified", "flying",
|
63
|
+
"objects", "(", "UFOs", ")", ",", "but", "may", "just", "as", "well", "describe", "meteors",
|
64
|
+
",", "and", ",", "since", "Obsequens", ",", "probably", ",", "writes", "in", "the", "4th",
|
65
|
+
"century", ",", "that", "is", ",", "some", "400", "years", "after", "the", "events", "he",
|
66
|
+
"describes", ",", "they", "hardly", "qualify", "as", "eye-witness", "accounts", "."],
|
67
|
+
["\"", "At", "Aenariae", ",", "while", "Livius", "Troso", "was", "promulgating", "the",
|
68
|
+
"laws", "at", "the", "beginning", "of", "the", "Italian", "war", ",", "at", "sunrise",
|
69
|
+
",", "there", "came", "a", "terrific", "noise", "in", "the", "sky", ",", "and", "a",
|
70
|
+
"globe", "of", "fire", "appeared", "burning", "in", "the", "north", "."]
|
71
|
+
]
|
72
|
+
end
|
73
|
+
context "when #tokenize is called on a group" do
|
74
|
+
it "separates the group into tokens" do
|
75
|
+
$workers.processors.tokenizers.each do |tokenizer|
|
76
|
+
@groups.dup.map { |text| group(text).tokenize(tokenizer) }
|
77
|
+
.map { |group| group.tokens.map(&:to_s) }
|
78
|
+
.should eql @tokens
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
describe Treat::Workers::Processors::Parsers do
|
85
|
+
before do
|
86
|
+
@groups = ["A sentence to tokenize."]
|
87
|
+
@phrases = [["A sentence to tokenize.", "A sentence", "to tokenize", "tokenize"]]
|
88
|
+
end
|
89
|
+
context "when #parse is called on a group" do
|
90
|
+
it "tokenizes and parses the group into its syntactical phrases" do
|
91
|
+
$workers.processors.parsers.each do |parser|
|
92
|
+
@groups.dup.map { |text| group(text).parse(parser) }
|
93
|
+
.map { |group| group.phrases.map(&:to_s)}
|
94
|
+
.should eql @phrases
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
describe Treat::Workers::Lexicalizers::Taggers do
|
101
|
+
before do
|
102
|
+
@groups = ["I was running"]
|
103
|
+
@group_tags = [["PRP", "VBD", "VBG"]]
|
104
|
+
@tokens = ["running", "man", "2", ".", "$"]
|
105
|
+
@token_tags = ["VBG", "NN", "CD", ".", "$"]
|
106
|
+
end
|
107
|
+
context "when #tag is called on a tokenized group" do
|
108
|
+
it "annotates each token in the group with its tag and returns the tag 'G'" do
|
109
|
+
$workers.lexicalizers.taggers.each do |tagger|
|
110
|
+
@groups.map { |txt| group(txt).tag(tagger) }
|
111
|
+
.all? { |tag| tag == 'G' }.should be_true
|
112
|
+
@groups.map { |txt| group(txt).tokenize }
|
113
|
+
.map { |g| g.tokens.map(&:tag) }
|
114
|
+
.should eql @group_tags
|
115
|
+
end
|
116
|
+
end
|
117
|
+
end
|
118
|
+
context "when #tag is called on a token" do
|
119
|
+
it "annotates the token with its tag and returns it" do
|
120
|
+
$workers.lexicalizers.taggers.each do |tagger|
|
121
|
+
@tokens.map { |tok| token(tok).tag(tagger) }
|
122
|
+
.should eql @token_tags
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
describe Treat::Workers::Lexicalizers::Sensers do
|
129
|
+
before do
|
130
|
+
@words = ["throw", "weak", "table", "furniture"]
|
131
|
+
@hyponyms = [
|
132
|
+
["slam", "flap down", "ground", "prostrate", "hurl", "hurtle",
|
133
|
+
"cast", "heave", "pelt", "bombard", "defenestrate", "deliver",
|
134
|
+
"pitch", "shy", "drive", "deep-six", "throw overboard", "ridge",
|
135
|
+
"jettison", "fling", "lob", "chuck", "toss", "skim", "skip",
|
136
|
+
"skitter", "juggle", "flip", "flick", "pass", "shed", "molt",
|
137
|
+
"exuviate", "moult", "slough", "abscise", "exfoliate", "autotomize",
|
138
|
+
"autotomise", "pop", "switch on", "turn on", "switch off", "cut",
|
139
|
+
"turn off", "turn out", "shoot", "demoralize", "perplex", "vex",
|
140
|
+
"stick", "get", "puzzle", "mystify", "baffle", "beat", "pose",
|
141
|
+
"bewilder", "disorient", "disorientate"],
|
142
|
+
[],
|
143
|
+
["correlation table", "contents", "table of contents", "actuarial table",
|
144
|
+
"statistical table", "calendar", "file allocation table", "periodic table",
|
145
|
+
"altar", "communion table", "Lord's table", "booth", "breakfast table",
|
146
|
+
"card table", "coffee table", "cocktail table", "conference table",
|
147
|
+
"council table", "council board", "console table", "console", "counter",
|
148
|
+
"desk", "dressing table", "dresser", "vanity", "toilet table", "drop-leaf table",
|
149
|
+
"gaming table", "gueridon", "kitchen table", "operating table", "Parsons table",
|
150
|
+
"pedestal table", "pier table", "platen", "pool table", "billiard table",
|
151
|
+
"snooker table", "stand", "table-tennis table", "ping-pong table",
|
152
|
+
"pingpong table", "tea table", "trestle table", "worktable", "work table",
|
153
|
+
"dining table", "board", "training table"],
|
154
|
+
["baby bed", "baby's bed", "bedroom furniture", "bedstead", "bedframe",
|
155
|
+
"bookcase", "buffet", "counter", "sideboard", "cabinet", "chest of drawers",
|
156
|
+
"chest", "bureau", "dresser", "dining-room furniture", "etagere", "fitment",
|
157
|
+
"hallstand", "lamp", "lawn furniture", "nest", "office furniture", "seat",
|
158
|
+
"sectional", "Sheraton", "sleeper", "table", "wall unit", "wardrobe",
|
159
|
+
"closet", "press", "washstand", "wash-hand stand"]
|
160
|
+
]
|
161
|
+
@hypernyms = [
|
162
|
+
["propel", "impel", "move", "remove", "take", "take away", "withdraw",
|
163
|
+
"put", "set", "place", "pose", "position", "lay", "communicate",
|
164
|
+
"intercommunicate", "engage", "mesh", "lock", "operate", "send",
|
165
|
+
"direct", "upset", "discompose", "untune", "disconcert", "discomfit",
|
166
|
+
"express", "verbalize", "verbalise", "utter", "give tongue to", "shape",
|
167
|
+
"form", "work", "mold", "mould", "forge", "dislodge", "bump", "turn", "release", "be"],
|
168
|
+
[],
|
169
|
+
["array", "furniture", "piece of furniture", "article of furniture",
|
170
|
+
"tableland", "plateau", "gathering", "assemblage", "fare"],
|
171
|
+
["furnishing"]
|
172
|
+
]
|
173
|
+
@antonyms = [[], ["strong"], [], []]
|
174
|
+
@synonyms = [
|
175
|
+
["throw", "shed", "cast", "cast off", "shake off", "throw off", "throw away",
|
176
|
+
"drop", "thrust", "give", "flip", "switch", "project", "contrive", "bewilder",
|
177
|
+
"bemuse", "discombobulate", "hurl", "hold", "have", "make", "confuse", "fox",
|
178
|
+
"befuddle", "fuddle", "bedevil", "confound"],
|
179
|
+
["weak", "watery", "washy", "unaccented", "light", "fallible", "frail", "imperfect",
|
180
|
+
"decrepit", "debile", "feeble", "infirm", "rickety", "sapless", "weakly", "faint"],
|
181
|
+
["table", "tabular array", "mesa", "board"],
|
182
|
+
["furniture", "piece of furniture", "article of furniture"]
|
183
|
+
]
|
184
|
+
end
|
185
|
+
|
186
|
+
context "when #hyponyms is called on a word, or #sense is "+
|
187
|
+
"called on a word with option :nym set to 'hyponyms'" do
|
188
|
+
it "returns the hyponyms of the word" do
|
189
|
+
$workers.lexicalizers.sensers.each do |senser|
|
190
|
+
@words.map { |txt| word(txt) }
|
191
|
+
.map { |wrd| wrd.hyponyms(senser) }.should eql @hyponyms
|
192
|
+
@words.map { |txt| word(txt) }
|
193
|
+
.map { |wrd| wrd.sense(nym: 'hyponyms') }
|
194
|
+
.should eql @hyponyms
|
195
|
+
end
|
196
|
+
end
|
197
|
+
end
|
198
|
+
|
199
|
+
context "when #hypernyms is called on a word or #sense is "+
|
200
|
+
"called on a word with option :nym set to 'hypernyms'" do
|
201
|
+
it "returns the hypernyms of the word" do
|
202
|
+
$workers.lexicalizers.sensers.each do |senser|
|
203
|
+
@words.map { |txt| word(txt) }
|
204
|
+
.map { |wrd| wrd.hypernyms(senser) }.should eql @hypernyms
|
205
|
+
@words.map { |txt| word(txt) }
|
206
|
+
.map { |wrd| wrd.sense(senser, nym: 'hypernyms') }
|
207
|
+
.should eql @hypernyms
|
208
|
+
end
|
209
|
+
end
|
210
|
+
end
|
211
|
+
|
212
|
+
context "when #antonyms is called on a word or #sense is " +
|
213
|
+
"called on a word with option :nym set to 'antonyms'" do
|
214
|
+
it "returns the antonyms of the word" do
|
215
|
+
$workers.lexicalizers.sensers.each do |senser|
|
216
|
+
@words.map { |txt| word(txt) }
|
217
|
+
.map { |wrd| wrd.antonyms(senser) }.should eql @antonyms
|
218
|
+
@words.map { |txt| word(txt) }
|
219
|
+
.map { |wrd| wrd.sense(senser, nym: 'antonyms') }
|
220
|
+
.should eql @antonyms
|
221
|
+
end
|
222
|
+
end
|
223
|
+
end
|
224
|
+
|
225
|
+
context "when #synonyms is called on a word or #sense is " +
|
226
|
+
"called on a word with option :nym set to 'synonyms'" do
|
227
|
+
it "returns the synonyms of the word" do
|
228
|
+
$workers.lexicalizers.sensers.each do |senser|
|
229
|
+
@words.map { |txt| word(txt) }
|
230
|
+
.map { |wrd| wrd.synonyms(senser) }.should eql @synonyms
|
231
|
+
@words.map { |txt| word(txt) }
|
232
|
+
.map { |wrd| wrd.sense(senser, nym: 'synonyms') }
|
233
|
+
.should eql @synonyms
|
234
|
+
end
|
235
|
+
end
|
236
|
+
end
|
237
|
+
|
238
|
+
end
|
239
|
+
|
240
|
+
describe Treat::Workers::Lexicalizers::Categorizers do
|
241
|
+
|
242
|
+
before do
|
243
|
+
@phrase = "I was running"
|
244
|
+
@fragment = "world. Hello"
|
245
|
+
@sentence = "I am running."
|
246
|
+
@group_categories = ["phrase",
|
247
|
+
"fragment", "sentence"]
|
248
|
+
@tokens = ["running"]
|
249
|
+
@token_tags = ["verb"]
|
250
|
+
end
|
251
|
+
|
252
|
+
context "when #category is called on a tokenized and tagged group" do
|
253
|
+
it "returns a tag corresponding to the group name" do
|
254
|
+
$workers.lexicalizers.categorizers.each do |categorizer|
|
255
|
+
[phrase(@phrase), fragment(@fragment), sentence(@sentence)]
|
256
|
+
.map { |grp| grp.apply(:tag).category(categorizer) }
|
257
|
+
.should eql @group_categories
|
258
|
+
end
|
259
|
+
end
|
260
|
+
end
|
261
|
+
|
262
|
+
context "when #category is called on a tagged token" do
|
263
|
+
it "returns the category corresponding to the token's tag" do
|
264
|
+
$workers.lexicalizers.categorizers.each do |categorizer|
|
265
|
+
@tokens.map { |tok| token(tok).apply(:tag).category(categorizer) }
|
266
|
+
.should eql @token_tags
|
267
|
+
end
|
268
|
+
end
|
269
|
+
end
|
270
|
+
|
271
|
+
end
|
193
272
|
|
273
|
+
describe Treat::Workers::Inflectors::Ordinalizers,
|
274
|
+
Treat::Workers::Inflectors::Cardinalizers do
|
275
|
+
|
276
|
+
before do
|
277
|
+
@numbers = [1, 2, 3]
|
278
|
+
@ordinal = ["first", "second", "third"]
|
279
|
+
@cardinal = ["one", "two", "three"]
|
280
|
+
end
|
281
|
+
|
282
|
+
context "when #ordinal is called on a number" do
|
283
|
+
it "returns the ordinal form (e.g. 'first') of the number" do
|
284
|
+
$workers.inflectors.ordinalizers.each do |ordinalizer|
|
285
|
+
@numbers.map { |num| number(num) }
|
286
|
+
.map { |num| num.ordinal(ordinalizer) }.should eql @ordinal
|
287
|
+
end
|
288
|
+
end
|
289
|
+
end
|
290
|
+
|
291
|
+
context "when #cardinal is called on a number" do
|
292
|
+
it "returns the cardinal form (e.g. 'two') of the number" do
|
293
|
+
$workers.inflectors.cardinalizers.each do |cardinalizer|
|
294
|
+
@numbers.map { |num| number(num) }
|
295
|
+
.map { |num| num.cardinal(cardinalizer) }.should eql @cardinal
|
296
|
+
end
|
297
|
+
end
|
298
|
+
end
|
299
|
+
|
300
|
+
end
|
301
|
+
|
302
|
+
describe Treat::Workers::Inflectors::Stemmers do
|
303
|
+
before do
|
304
|
+
@words = ["running"]
|
305
|
+
@stems = ["run"]
|
306
|
+
end
|
307
|
+
context "when #stem is called on a word" do
|
308
|
+
it "annotates the word with its stem and returns the stem" do
|
309
|
+
$workers.inflectors.stemmers.each do |stemmer|
|
310
|
+
@words.map { |wrd| wrd.stem(stemmer) }.should eql @stems
|
311
|
+
end
|
312
|
+
end
|
313
|
+
end
|
314
|
+
end
|
315
|
+
|
316
|
+
describe Treat::Workers::Extractors::NameTag do
|
317
|
+
before do
|
318
|
+
@groups = ["Obama and Sarkozy will meet in Berlin."]
|
319
|
+
@tags = [["person", nil, "person", nil, nil, nil, "location", nil]]
|
320
|
+
end
|
321
|
+
|
322
|
+
context "when #name_tag is called on a tokenized group" do
|
323
|
+
it "tags each token with its name tag" do
|
324
|
+
$workers.extractors.name_tag.each do |tagger|
|
325
|
+
@groups.map { |grp| grp.tokenize.apply(:name_tag) }
|
326
|
+
.map { |grp| grp.tokens.map { |t| t.get(:name_tag) } }
|
327
|
+
.should eql @tags
|
328
|
+
end
|
329
|
+
end
|
330
|
+
end
|
331
|
+
|
332
|
+
end
|
333
|
+
|
334
|
+
describe Treat::Workers::Extractors::Topics do
|
335
|
+
before do
|
336
|
+
@files = ["./spec/workers/examples/english/test.txt"]
|
337
|
+
@topics = [['household goods and hardware',
|
338
|
+
'united states of america', 'corporate/industrial']]
|
339
|
+
end
|
340
|
+
context "when #topics is called on a chunked, segmented and tokenized document" do
|
341
|
+
it "annotates the document with its general topics and returns them" do
|
342
|
+
$workers.extractors.topics.each do |extractor|
|
343
|
+
@files.map { |f| document(f).apply(:chunk, :segment, :tokenize) }
|
344
|
+
.map { |doc| doc.topics }.should eql @topics
|
345
|
+
end
|
346
|
+
end
|
347
|
+
end
|
348
|
+
end
|
349
|
+
|
350
|
+
describe Treat::Workers::Extractors::Time do
|
351
|
+
before do
|
352
|
+
@expressions = ["october 2006"]
|
353
|
+
@months = [10]
|
354
|
+
end
|
355
|
+
context "when called on a tokenized group representing a time expression" do
|
356
|
+
it "returns the DateTime object corresponding to the time" do
|
357
|
+
$workers.extractors.time.each do |extractor|
|
358
|
+
puts @expressions.map(&:time).inspect
|
359
|
+
@expressions.map(&:time).all? { |time| time
|
360
|
+
.is_a?(DateTime) }.should be_true
|
361
|
+
@expressions.map(&:time).map { |time| time.month }
|
362
|
+
.should eql @months
|
363
|
+
end
|
364
|
+
end
|
365
|
+
end
|
194
366
|
end
|
367
|
+
|
368
|
+
describe Treat::Workers::Extractors::TopicWords do
|
369
|
+
|
370
|
+
before do
|
371
|
+
@collections = ["./spec/workers/examples/english/economist"]
|
372
|
+
@topic_words = [["euro", "zone", "european", "mrs", "greece", "chancellor",
|
373
|
+
"berlin", "practice", "german", "germans"], ["bank", "minister", "central",
|
374
|
+
"bajnai", "mr", "hu", "orban", "commission", "hungarian", "government"],
|
375
|
+
["bank", "mr", "central", "bajnai", "prime", "government", "brussels",
|
376
|
+
"responsibility", "national", "independence"], ["mr", "bank", "central",
|
377
|
+
"policies", "prime", "minister", "today", "financial", "government", "funds"],
|
378
|
+
["euro", "merkel", "mr", "zone", "european", "greece", "german", "berlin",
|
379
|
+
"sarkozy", "government"], ["mr", "bajnai", "today", "orban", "government",
|
380
|
+
"forced", "independence", "part", "hand", "minister"], ["sarkozy", "mrs",
|
381
|
+
"zone", "euro", "fiscal", "called", "greece", "merkel", "german", "financial"],
|
382
|
+
["mr", "called", "central", "policies", "financial", "bank", "european",
|
383
|
+
"prime", "minister", "shift"], ["bajnai", "orban", "prime", "mr", "government",
|
384
|
+
"independence", "forced", "commission", "-", "hvg"], ["euro", "sarkozy", "fiscal",
|
385
|
+
"merkel", "mr", "chancellor", "european", "german", "agenda", "soap"], ["mr",
|
386
|
+
"bank", "called", "central", "today", "prime", "government", "minister", "european",
|
387
|
+
"crisis"], ["mr", "fiscal", "mrs", "sarkozy", "merkel", "euro", "summit", "tax",
|
388
|
+
"leaders", "ecb"], ["called", "government", "financial", "policies", "part", "bank",
|
389
|
+
"central", "press", "mr", "president"], ["sarkozy", "merkel", "euro", "mr", "summit",
|
390
|
+
"mrs", "fiscal", "merkozy", "economic", "german"], ["mr", "prime", "minister",
|
391
|
+
"policies", "government", "financial", "crisis", "bank", "called", "part"], ["mr",
|
392
|
+
"bank", "government", "today", "called", "central", "minister", "prime", "issues",
|
393
|
+
"president"], ["mr", "orban", "central", "government", "parliament", "hungarian",
|
394
|
+
"minister", "hu", "personal", "bajnai"], ["government", "called", "central", "european",
|
395
|
+
"today", "bank", "prime", "financial", "part", "deficit"], ["mr", "orban", "government",
|
396
|
+
"hungarian", "bank", "hvg", "minister", "-", "fidesz", "hand"], ["mr", "bank", "european",
|
397
|
+
"minister", "policies", "crisis", "government", "president", "called", "shift"]]
|
398
|
+
end
|
399
|
+
|
400
|
+
context "when #topic_words is called on a chunked, segmented and tokenized collection" do
|
401
|
+
it "annotates the collection with the topic words and returns them" do
|
402
|
+
$workers.extractors.topic_words.each do |extractor|
|
403
|
+
@collections.map(&method(:collection))
|
404
|
+
.map { |col| col.apply(:chunk,:segment,:tokenize) }
|
405
|
+
.map { |col| col.topic_words }.should eql @topic_words
|
406
|
+
end
|
407
|
+
end
|
408
|
+
end
|
409
|
+
end
|
410
|
+
|
411
|
+
describe Treat::Workers::Inflectors::Conjugators do
|
412
|
+
before do
|
413
|
+
@infinitives = ["run"]
|
414
|
+
@participles = ["running"]
|
415
|
+
end
|
416
|
+
|
417
|
+
context "when #infinitive is called on a word or #conjugate " +
|
418
|
+
"is called on a word with option :form set to 'infinitive'" do
|
419
|
+
it "returns the infinitive form of the verb" do
|
420
|
+
$workers.inflectors.conjugators.each do |conjugator|
|
421
|
+
@participles.map { |verb| verb
|
422
|
+
.infinitive(conjugator) }
|
423
|
+
.should eql @infinitives
|
424
|
+
@participles.map { |verb| verb.conjugate(
|
425
|
+
conjugator, form: 'infinitive') }
|
426
|
+
.should eql @infinitives
|
427
|
+
end
|
428
|
+
end
|
429
|
+
end
|
430
|
+
|
431
|
+
context "when #present_participle is called on a word or #conjugate is " +
|
432
|
+
"called on a word with option :form set to 'present_participle'" do
|
433
|
+
it "returns the present participle form of the verb" do
|
434
|
+
$workers.inflectors.conjugators.each do |conjugator|
|
435
|
+
@infinitives.map { |verb| verb
|
436
|
+
.present_participle(conjugator) }
|
437
|
+
.should eql @participles
|
438
|
+
@infinitives.map { |verb| verb.conjugate(
|
439
|
+
conjugator, form: 'present_participle') }
|
440
|
+
.should eql @participles
|
441
|
+
end
|
442
|
+
end
|
443
|
+
end
|
444
|
+
|
445
|
+
end
|
446
|
+
|
447
|
+
describe Treat::Workers::Inflectors::Declensors do
|
448
|
+
before do
|
449
|
+
@singulars = ["man"]
|
450
|
+
@plurals = ["men"]
|
451
|
+
end
|
452
|
+
context "when #plural is called on a word, or #declense "+
|
453
|
+
"is called on a word with option :count set to 'plural'" do
|
454
|
+
it "returns the plural form of the word" do
|
455
|
+
$workers.inflectors.declensors.each do |declensor|
|
456
|
+
@singulars.map { |word| word.plural(declensor) }
|
457
|
+
.should eql @plurals
|
458
|
+
@singulars.map { |word| word
|
459
|
+
.declense(declensor, count: 'plural') }
|
460
|
+
.should eql @plurals
|
461
|
+
end
|
462
|
+
end
|
463
|
+
end
|
464
|
+
context "when #singular is called on a word, or #declense " +
|
465
|
+
"is called on a word with option :count set to 'singular'" do
|
466
|
+
it "returns the singular form of the word" do
|
467
|
+
$workers.inflectors.declensors.each do |declensor|
|
468
|
+
next if declensor == :linguistics
|
469
|
+
@plurals.map { |word| word.singular(declensor) }
|
470
|
+
.should eql @singulars
|
471
|
+
@singulars.map { |word| word
|
472
|
+
.declense(declensor, count: 'singular') }
|
473
|
+
.should eql @singulars
|
474
|
+
end
|
475
|
+
end
|
476
|
+
end
|
477
|
+
end
|
478
|
+
|
479
|
+
end
|