treat 2.0.0 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,194 +1,479 @@
1
- class Treat::Specs::Workers::English < Treat::Specs::Workers::Language
1
+ require 'rspec'
2
2
 
3
- # TODO: parse
3
+ require_relative '../../lib/treat'
4
+ include Treat::Core::DSL
5
+
6
+ =begin
7
+ Treat.libraries.stanford.model_path = '/ruby/stanford/stanford-core-nlp-all/'
8
+ Treat.libraries.stanford.jar_path = '/ruby/stanford/stanford-core-nlp-all/'
9
+ Treat.libraries.punkt.model_path = '/ruby/punkt/'
10
+ Treat.libraries.reuters.model_path = '/ruby/reuters/'
11
+ =end
12
+
13
+ class English
14
+
15
+ $workers = Treat.languages.english.workers
16
+ Treat.core.language.default = 'english'
17
+ Treat.core.language.detect = false
18
+
19
+ describe Treat::Workers::Processors::Segmenters do
20
+
21
+ before do
22
+ @zones = ["Qala is first referred to in a fifteenth century portolan preserved at the Vatican library has taken its name from the qala or port of Mondoq ir-Rummien. It is the easternmost village of Gozo and has been inhabited since early times. The development of the present settlement began in the second half of the seventeenth century. It is a pleasant and rural place with many natural and historic attractions.",
23
+ "Originally Radio Lehen il-Qala transmitted on frequency 106.5FM. But when consequently a national radio started transmissions on a frequency quite close, it caused a hindrance to our community radio." "People were complaining that the voice of the local radio was no longer clear and they were experiencing difficulty in following the programmes. This was a further proof of the value of the radio. It was a confirmation that it was a good and modern means of bringing the Christian message to the whole community. An official request was therefore made to the Broadcasting Authority and Radio Lehen il-Qala was given a new frequency - 106.3FM."]
24
+ @groups = [
25
+ ["Qala is first referred to in a fifteenth century portolan preserved at the Vatican library has taken its name from the qala or port of Mondoq ir-Rummien.", "It is the easternmost village of Gozo and has been inhabited since early times.", "The development of the present settlement began in the second half of the seventeenth century.", "It is a pleasant and rural place with many natural and historic attractions."],
26
+ ["Originally Radio Lehen il-Qala transmitted on frequency 106.5FM.", "But when consequently a national radio started transmissions on a frequency quite close, it caused a hindrance to our community radio.", "People were complaining that the voice of the local radio was no longer clear and they were experiencing difficulty in following the programmes.", "This was a further proof of the value of the radio.", "It was a confirmation that it was a good and modern means of bringing the Christian message to the whole community.", "An official request was therefore made to the Broadcasting Authority and Radio Lehen il-Qala was given a new frequency - 106.3FM."]
27
+ ]
28
+ end
4
29
 
5
- Scenarios = {
6
- tokenize: {
7
- group: {
8
- examples: [
9
- ["Julius Obsequens was a Roman writer who is believed to have lived in the middle of the fourth century AD.", ["Julius", "Obsequens", "was", "a", "Roman", "writer", "who", "is", "believed", "to", "have", "lived", "in", "the", "middle", "of", "the", "fourth", "century", "AD", "."]],
10
- ["The only work associated with his name is the Liber de prodigiis (Book of Prodigies), completely extracted from an epitome, or abridgment, written by Livy; De prodigiis was constructed as an account of the wonders and portents that occurred in Rome between 249 BC-12 BC.", ["The", "only", "work", "associated", "with", "his", "name", "is", "the", "Liber", "de", "prodigiis", "(", "Book", "of", "Prodigies", ")", ",", "completely", "extracted", "from", "an", "epitome", ",", "or", "abridgment", ",", "written", "by", "Livy", ";", "De", "prodigiis", "was", "constructed", "as", "an", "account", "of", "the", "wonders", "and", "portents", "that", "occurred", "in", "Rome", "between", "249", "BC-12", "BC", "."]],
11
- ["Of great importance was the edition by the Basle Humanist Conrad Lycosthenes (1552), trying to reconstruct lost parts and illustrating the text with wood-cuts.", ["Of", "great", "importance", "was", "the", "edition", "by", "the", "Basle", "Humanist", "Conrad", "Lycosthenes", "(", "1552", ")", ",", "trying", "to", "reconstruct", "lost", "parts", "and", "illustrating", "the", "text", "with", "wood-cuts", "."]],
12
- ["These have been interpreted as reports of unidentified flying objects (UFOs), but may just as well describe meteors, and, since Obsequens, probably, writes in the 4th century, that is, some 400 years after the events he describes, they hardly qualify as eye-witness accounts.", ["These", "have", "been", "interpreted", "as", "reports", "of", "unidentified", "flying", "objects", "(", "UFOs", ")", ",", "but", "may", "just", "as", "well", "describe", "meteors", ",", "and", ",", "since", "Obsequens", ",", "probably", ",", "writes", "in", "the", "4th", "century", ",", "that", "is", ",", "some", "400", "years", "after", "the", "events", "he", "describes", ",", "they", "hardly", "qualify", "as", "eye-witness", "accounts", "."]],
13
- ['"At Aenariae, while Livius Troso was promulgating the laws at the beginning of the Italian war, at sunrise, there came a terrific noise in the sky, and a globe of fire appeared burning in the north.', ["\"", "At", "Aenariae", ",", "while", "Livius", "Troso", "was", "promulgating", "the", "laws", "at", "the", "beginning", "of", "the", "Italian", "war", ",", "at", "sunrise", ",", "there", "came", "a", "terrific", "noise", "in", "the", "sky", ",", "and", "a", "globe", "of", "fire", "appeared", "burning", "in", "the", "north", "."]]
14
- ],
15
- generator: lambda { |entity| entity.tokens.map { |tok| tok.to_s } }
16
- }
17
- },
18
- parse: {
19
- group: {
20
- examples: [
21
- ["A sentence to tokenize.", ["A sentence to tokenize.", "A sentence", "to tokenize",
22
- "tokenize"]]
23
- ],
24
- generator: lambda { |group| group.phrases.map { |phrase| phrase.to_s } }
25
- }
26
- },
27
- segment: {
28
- zone: {
29
- examples: [
30
- ["Qala is first referred to in a fifteenth century portolan preserved at the Vatican library has taken its name from the qala or port of Mondoq ir-Rummien. It is the easternmost village of Gozo and has been inhabited since early times. The development of the present settlement began in the second half of the seventeenth century. It is a pleasant and rural place with many natural and historic attractions.", ["Qala is first referred to in a fifteenth century portolan preserved at the Vatican library has taken its name from the qala or port of Mondoq ir-Rummien.", "It is the easternmost village of Gozo and has been inhabited since early times.", "The development of the present settlement began in the second half of the seventeenth century.", "It is a pleasant and rural place with many natural and historic attractions."]],
31
- ["Originally Radio Lehen il-Qala transmitted on frequency 106.5FM. But when consequently a national radio started transmissions on a frequency quite close, it caused a hindrance to our community radio." "People were complaining that the voice of the local radio was no longer clear and they were experiencing difficulty in following the programmes. This was a further proof of the value of the radio. It was a confirmation that it was a good and modern means of bringing the Christian message to the whole community. An official request was therefore made to the Broadcasting Authority and Radio Lehen il-Qala was given a new frequency - 106.3FM.", ["Originally Radio Lehen il-Qala transmitted on frequency 106.5FM.", "But when consequently a national radio started transmissions on a frequency quite close, it caused a hindrance to our community radio.", "People were complaining that the voice of the local radio was no longer clear and they were experiencing difficulty in following the programmes.", "This was a further proof of the value of the radio.", "It was a confirmation that it was a good and modern means of bringing the Christian message to the whole community.", "An official request was therefore made to the Broadcasting Authority and Radio Lehen il-Qala was given a new frequency - 106.3FM."]]
32
- ],
33
- generator: lambda { |entity| entity.sentences.map { |sent| sent.to_s } }
34
- }
35
- },
36
- tag: {
37
- phrase: {
38
- examples: [
39
- ["I was running", "P"]
40
- ]
41
- },
42
- token: {
43
- examples: [
44
- ["running", "VBG"],
45
- ["man", "NN"],
46
- ["2", "CD"],
47
- [".", "."],
48
- ["$", "$"]
49
- ]
50
- }
51
- },
52
- category: {
53
- phrase: {
54
- examples: [
55
- ["I was running", "phrase"]
56
- ]
57
- },
58
- token: {
59
- examples: [
60
- ["running", "verb"]
61
- ]
62
- }
63
- },
64
- ordinal: {
65
- word: {
66
- examples: [
67
- ["20", "twentieth"]
68
- ]
69
- },
70
- number: {
71
- examples: [
72
- [20, "twentieth"]
73
- ]
74
- }
75
- },
76
- cardinal: {
77
- word: {
78
- examples: [
79
- ['20', "twenty"]
80
- ]
81
- },
82
- number: {
83
- examples: [
84
- [20, "twenty"]
85
- ]
86
- }
87
- },
88
- name_tag: {
89
- group: {
90
- examples: [
91
- ["Obama and Sarkozy will meet in Berlin.", ["person", nil, "person", nil, nil, nil, "location"]]
92
- ],
93
- preprocessor: lambda { |group| group.tokenize },
94
- generator: lambda { |group| group.words.map { |word| word.get(:name_tag) } }
95
- }
96
- },
97
- language: { ######
98
- entity: {
99
- examples: [
100
- ["Obama and Sarkozy will meet in Berlin.", "english"]
101
- ],
102
- preprocessor: lambda { |entity| Treat.core.language.detect = true; entity.do(:tokenize); entity },
103
- postprocessor: lambda { |entity| Treat.core.language.detect = false; entity; },
104
- generator: lambda { |group| group.words.map { |word| word.get(:name_tag) } }
105
- }
106
- },
107
- stem: {
108
- word: {
109
- examples: [
110
- ["running", "run"]
111
- ]
112
- }
113
- },
114
- time: {
115
- group: {
116
- examples: [
117
- ['october 2006', 10]
118
- ],
119
- generator: lambda { |entity| entity.time.month }
120
- }
121
- },
122
- topics: {
123
- document: {
124
- examples: [
125
- ["./spec/workers/examples/english/test.txt",
126
- ['household goods and hardware',
127
- 'united states of america',
128
- 'corporate/industrial']]
129
- ],
130
- preprocessor: lambda { |doc| doc.do :chunk, :segment, :tokenize }
131
- },
132
- section: {
133
- # Must implement
134
- },
135
- zone: {
136
- examples: [
137
- ["Michigan, Ohio, Texas - Unfortunately, the RadioShack is closing. This is horrible news for U.S. politics.", ['household goods and hardware', 'united states of america', 'corporate/industrial']]
138
- ],
139
- preprocessor: lambda { |zone| zone.do :segment, :tokenize }
140
- }
141
- },
142
- topic_words: {
143
- collection: {
144
- examples: [
145
- ["./perf/examples/economist", [""]]
146
- ],
147
- preprocessor: lambda { |coll| coll.do :chunk, :segment, :tokenize }
148
- }
149
- },
150
- conjugate: {
151
- word: {
152
- examples: {
153
- present_participle: [
154
- ["run", "running"]
155
- ],
156
- infinitive: [
157
- ["running", "run"]
158
- ]
159
- }
160
- }
161
- },
162
- declense: {
163
- word: {
164
- examples: {
165
- singular: [
166
- ["men", "man"]
167
- ],
168
- plural: [
169
- ["man", "men"]
170
- ]
171
- }
172
- }
173
- },
174
- sense: {
175
- word: {
176
- examples: {
177
- synonyms: [
178
- ["throw", ["throw", "shed", "cast", "cast off", "shake off", "throw off", "throw away", "drop", "thrust", "give", "flip", "switch", "project", "contrive", "bewilder", "bemuse", "discombobulate", "hurl", "hold", "have", "make", "confuse", "fox", "befuddle", "fuddle", "bedevil", "confound"]]
179
- ],
180
- antonyms: [
181
- ["weak", ["strong"]]
182
- ],
183
- hypernyms: [
184
- ["table", ["array", "furniture", "piece of furniture", "article of furniture", "tableland", "plateau", "gathering", "assemblage", "fare"]]
185
- ],
186
- hyponyms: [
187
- ["furniture", ["baby bed", "baby's bed", "bedroom furniture", "bedstead", "bedframe", "bookcase", "buffet", "counter", "sideboard", "cabinet", "chest of drawers", "chest", "bureau", "dresser", "dining-room furniture", "etagere", "fitment", "hallstand", "lamp", "lawn furniture", "nest", "office furniture", "seat", "sectional", "Sheraton", "sleeper", "table", "wall unit", "wardrobe", "closet", "press", "washstand", "wash-hand stand"]]
188
- ]
189
- }
190
- }
191
- },
192
- }
30
+ context "when #segment is called on a zone" do
31
+ it "segments the zone into groups" do
32
+ $workers.processors.segmenters.each do |segmenter|
33
+ @zones.map { |zone| zone.segment(segmenter) }
34
+ .map { |zone| zone.groups.map(&:to_s) }
35
+ .should eql @groups
36
+ end
37
+ end
38
+ end
39
+ end
40
+
41
+ describe Treat::Workers::Processors::Tokenizers do
42
+
43
+ before do
44
+ @groups = [
45
+ "Julius Obsequens was a Roman writer who is believed to have lived in the middle of the fourth century AD.",
46
+ "The only work associated with his name is the Liber de prodigiis (Book of Prodigies), completely extracted from an epitome, or abridgment, written by Livy; De prodigiis was constructed as an account of the wonders and portents that occurred in Rome between 249 BC-12 BC.",
47
+ "Of great importance was the edition by the Basle Humanist Conrad Lycosthenes (1552), trying to reconstruct lost parts and illustrating the text with wood-cuts.",
48
+ "These have been interpreted as reports of unidentified flying objects (UFOs), but may just as well describe meteors, and, since Obsequens, probably, writes in the 4th century, that is, some 400 years after the events he describes, they hardly qualify as eye-witness accounts.",
49
+ '"At Aenariae, while Livius Troso was promulgating the laws at the beginning of the Italian war, at sunrise, there came a terrific noise in the sky, and a globe of fire appeared burning in the north.'
50
+ ]
51
+ @tokens = [
52
+ ["Julius", "Obsequens", "was", "a", "Roman", "writer", "who", "is", "believed",
53
+ "to", "have", "lived", "in", "the", "middle", "of", "the", "fourth", "century", "AD", "."],
54
+ ["The", "only", "work", "associated", "with", "his", "name", "is", "the", "Liber",
55
+ "de", "prodigiis", "(", "Book", "of", "Prodigies", ")", ",", "completely", "extracted",
56
+ "from", "an", "epitome", ",", "or", "abridgment", ",", "written", "by", "Livy", ";",
57
+ "De", "prodigiis", "was", "constructed", "as", "an", "account", "of", "the", "wonders",
58
+ "and", "portents", "that", "occurred", "in", "Rome", "between", "249", "BC-12", "BC", "."],
59
+ ["Of", "great", "importance", "was", "the", "edition", "by", "the", "Basle", "Humanist",
60
+ "Conrad", "Lycosthenes", "(", "1552", ")", ",", "trying", "to", "reconstruct", "lost",
61
+ "parts", "and", "illustrating", "the", "text", "with", "wood-cuts", "."],
62
+ ["These", "have", "been", "interpreted", "as", "reports", "of", "unidentified", "flying",
63
+ "objects", "(", "UFOs", ")", ",", "but", "may", "just", "as", "well", "describe", "meteors",
64
+ ",", "and", ",", "since", "Obsequens", ",", "probably", ",", "writes", "in", "the", "4th",
65
+ "century", ",", "that", "is", ",", "some", "400", "years", "after", "the", "events", "he",
66
+ "describes", ",", "they", "hardly", "qualify", "as", "eye-witness", "accounts", "."],
67
+ ["\"", "At", "Aenariae", ",", "while", "Livius", "Troso", "was", "promulgating", "the",
68
+ "laws", "at", "the", "beginning", "of", "the", "Italian", "war", ",", "at", "sunrise",
69
+ ",", "there", "came", "a", "terrific", "noise", "in", "the", "sky", ",", "and", "a",
70
+ "globe", "of", "fire", "appeared", "burning", "in", "the", "north", "."]
71
+ ]
72
+ end
73
+ context "when #tokenize is called on a group" do
74
+ it "separates the group into tokens" do
75
+ $workers.processors.tokenizers.each do |tokenizer|
76
+ @groups.dup.map { |text| group(text).tokenize(tokenizer) }
77
+ .map { |group| group.tokens.map(&:to_s) }
78
+ .should eql @tokens
79
+ end
80
+ end
81
+ end
82
+ end
83
+
84
+ describe Treat::Workers::Processors::Parsers do
85
+ before do
86
+ @groups = ["A sentence to tokenize."]
87
+ @phrases = [["A sentence to tokenize.", "A sentence", "to tokenize", "tokenize"]]
88
+ end
89
+ context "when #parse is called on a group" do
90
+ it "tokenizes and parses the group into its syntactical phrases" do
91
+ $workers.processors.parsers.each do |parser|
92
+ @groups.dup.map { |text| group(text).parse(parser) }
93
+ .map { |group| group.phrases.map(&:to_s)}
94
+ .should eql @phrases
95
+ end
96
+ end
97
+ end
98
+ end
99
+
100
+ describe Treat::Workers::Lexicalizers::Taggers do
101
+ before do
102
+ @groups = ["I was running"]
103
+ @group_tags = [["PRP", "VBD", "VBG"]]
104
+ @tokens = ["running", "man", "2", ".", "$"]
105
+ @token_tags = ["VBG", "NN", "CD", ".", "$"]
106
+ end
107
+ context "when #tag is is called on a tokenized group" do
108
+ it "annotates each token in the group with its tag and returns the tag 'G'" do
109
+ $workers.lexicalizers.taggers.each do |tagger|
110
+ @groups.map { |txt| group(txt).tag(tagger) }
111
+ .all? { |tag| tag == 'G' }.should be_true
112
+ @groups.map { |txt| group(txt).tokenize }
113
+ .map { |g| g.tokens.map(&:tag) }
114
+ .should eql @group_tags
115
+ end
116
+ end
117
+ end
118
+ context "when #tag is called on a token" do
119
+ it "annotates the token with its tag and returns it" do
120
+ $workers.lexicalizers.taggers.each do |tagger|
121
+ @tokens.map { |tok| token(tok).tag(tagger) }
122
+ .should eql @token_tags
123
+ end
124
+ end
125
+ end
126
+ end
127
+
128
+ describe Treat::Workers::Lexicalizers::Sensers do
129
+ before do
130
+ @words = ["throw", "weak", "table", "furniture"]
131
+ @hyponyms = [
132
+ ["slam", "flap down", "ground", "prostrate", "hurl", "hurtle",
133
+ "cast", "heave", "pelt", "bombard", "defenestrate", "deliver",
134
+ "pitch", "shy", "drive", "deep-six", "throw overboard", "ridge",
135
+ "jettison", "fling", "lob", "chuck", "toss", "skim", "skip",
136
+ "skitter", "juggle", "flip", "flick", "pass", "shed", "molt",
137
+ "exuviate", "moult", "slough", "abscise", "exfoliate", "autotomize",
138
+ "autotomise", "pop", "switch on", "turn on", "switch off", "cut",
139
+ "turn off", "turn out", "shoot", "demoralize", "perplex", "vex",
140
+ "stick", "get", "puzzle", "mystify", "baffle", "beat", "pose",
141
+ "bewilder", "disorient", "disorientate"],
142
+ [],
143
+ ["correlation table", "contents", "table of contents", "actuarial table",
144
+ "statistical table", "calendar", "file allocation table", "periodic table",
145
+ "altar", "communion table", "Lord's table", "booth", "breakfast table",
146
+ "card table", "coffee table", "cocktail table", "conference table",
147
+ "council table", "council board", "console table", "console", "counter",
148
+ "desk", "dressing table", "dresser", "vanity", "toilet table", "drop-leaf table",
149
+ "gaming table", "gueridon", "kitchen table", "operating table", "Parsons table",
150
+ "pedestal table", "pier table", "platen", "pool table", "billiard table",
151
+ "snooker table", "stand", "table-tennis table", "ping-pong table",
152
+ "pingpong table", "tea table", "trestle table", "worktable", "work table",
153
+ "dining table", "board", "training table"],
154
+ ["baby bed", "baby's bed", "bedroom furniture", "bedstead", "bedframe",
155
+ "bookcase", "buffet", "counter", "sideboard", "cabinet", "chest of drawers",
156
+ "chest", "bureau", "dresser", "dining-room furniture", "etagere", "fitment",
157
+ "hallstand", "lamp", "lawn furniture", "nest", "office furniture", "seat",
158
+ "sectional", "Sheraton", "sleeper", "table", "wall unit", "wardrobe",
159
+ "closet", "press", "washstand", "wash-hand stand"]
160
+ ]
161
+ @hypernyms = [
162
+ ["propel", "impel", "move", "remove", "take", "take away", "withdraw",
163
+ "put", "set", "place", "pose", "position", "lay", "communicate",
164
+ "intercommunicate", "engage", "mesh", "lock", "operate", "send",
165
+ "direct", "upset", "discompose", "untune", "disconcert", "discomfit",
166
+ "express", "verbalize", "verbalise", "utter", "give tongue to", "shape",
167
+ "form", "work", "mold", "mould", "forge", "dislodge", "bump", "turn", "release", "be"],
168
+ [],
169
+ ["array", "furniture", "piece of furniture", "article of furniture",
170
+ "tableland", "plateau", "gathering", "assemblage", "fare"],
171
+ ["furnishing"]
172
+ ]
173
+ @antonyms = [[], ["strong"], [], []]
174
+ @synonyms = [
175
+ ["throw", "shed", "cast", "cast off", "shake off", "throw off", "throw away",
176
+ "drop", "thrust", "give", "flip", "switch", "project", "contrive", "bewilder",
177
+ "bemuse", "discombobulate", "hurl", "hold", "have", "make", "confuse", "fox",
178
+ "befuddle", "fuddle", "bedevil", "confound"],
179
+ ["weak", "watery", "washy", "unaccented", "light", "fallible", "frail", "imperfect",
180
+ "decrepit", "debile", "feeble", "infirm", "rickety", "sapless", "weakly", "faint"],
181
+ ["table", "tabular array", "mesa", "board"],
182
+ ["furniture", "piece of furniture", "article of furniture"]
183
+ ]
184
+ end
185
+
186
+ context "when #synonym is called on a word, or #sense is "+
187
+ "called on a word with option :nym set to 'hyponyms'" do
188
+ it "returns the hyponyms of the word" do
189
+ $workers.lexicalizers.sensers.each do |senser|
190
+ @words.map { |txt| word(txt) }
191
+ .map { |wrd| wrd.hyponyms(senser) }.should eql @hyponyms
192
+ @words.map { |txt| word(txt) }
193
+ .map { |wrd| wrd.sense(nym: 'hyponyms') }
194
+ .should eql @hyponyms
195
+ end
196
+ end
197
+ end
198
+
199
+ context "when #hypernyms is called on a word or #sense is "+
200
+ "called on a word with option :nym set to 'hyponyms'" do
201
+ it "returns the hyponyms of the word" do
202
+ $workers.lexicalizers.sensers.each do |senser|
203
+ @words.map { |txt| word(txt) }
204
+ .map { |wrd| wrd.hypernyms(senser) }.should eql @hypernyms
205
+ @words.map { |txt| word(txt) }
206
+ .map { |wrd| wrd.sense(senser, nym: 'hypernyms') }
207
+ .should eql @hypernyms
208
+ end
209
+ end
210
+ end
211
+
212
+ context "when #antonyms is called on a word or #sense is" +
213
+ "called on a word with option :nym set to 'antonyms'" do
214
+ it "returns the hyponyms of the word" do
215
+ $workers.lexicalizers.sensers.each do |senser|
216
+ @words.map { |txt| word(txt) }
217
+ .map { |wrd| wrd.antonyms(senser) }.should eql @antonyms
218
+ @words.map { |txt| word(txt) }
219
+ .map { |wrd| wrd.sense(senser, nym: 'antonyms') }
220
+ .should eql @antonyms
221
+ end
222
+ end
223
+ end
224
+
225
+ context "when #synonyms is called on a word or #sense is" +
226
+ "called on a word with option :nym set to 'synonyms'" do
227
+ it "returns the hyponyms of the word" do
228
+ $workers.lexicalizers.sensers.each do |senser|
229
+ @words.map { |txt| word(txt) }
230
+ .map { |wrd| wrd.synonyms(senser) }.should eql @synonyms
231
+ @words.map { |txt| word(txt) }
232
+ .map { |wrd| wrd.sense(senser, nym: 'synonyms') }
233
+ .should eql @synonyms
234
+ end
235
+ end
236
+ end
237
+
238
+ end
239
+
240
+ describe Treat::Workers::Lexicalizers::Categorizers do
241
+
242
+ before do
243
+ @phrase = "I was running"
244
+ @fragment = "world. Hello"
245
+ @sentence = "I am running."
246
+ @group_categories = ["phrase",
247
+ "fragment", "sentence"]
248
+ @tokens = ["running"]
249
+ @token_tags = ["verb"]
250
+ end
251
+
252
+ context "when #category is called on a tokenized and tagged group" do
253
+ it "returns a tag corresponding to the group name" do
254
+ $workers.lexicalizers.categorizers.each do |categorizer|
255
+ [phrase(@phrase), fragment(@fragment), sentence(@sentence)]
256
+ .map { |grp| grp.apply(:tag).category(categorizer) }
257
+ .should eql @group_categories
258
+ end
259
+ end
260
+ end
261
+
262
+ context "when #category is called called on a tagged token" do
263
+ it "returns the category corresponding to the token's tag" do
264
+ $workers.lexicalizers.categorizers.each do |categorizer|
265
+ @tokens.map { |tok| token(tok).apply(:tag).category(categorizer) }
266
+ .should eql @token_tags
267
+ end
268
+ end
269
+ end
270
+
271
+ end
193
272
 
273
+ describe Treat::Workers::Inflectors::Ordinalizers,
274
+ Treat::Workers::Inflectors::Cardinalizers do
275
+
276
+ before do
277
+ @numbers = [1, 2, 3]
278
+ @ordinal = ["first", "second", "third"]
279
+ @cardinal = ["one", "two", "three"]
280
+ end
281
+
282
+ context "when #ordinal is called on a number" do
283
+ it "returns the ordinal form (e.g. 'first') of the number" do
284
+ $workers.inflectors.ordinalizers.each do |ordinalizer|
285
+ @numbers.map { |num| number(num) }
286
+ .map { |num| num.ordinal(ordinalizer) }.should eql @ordinal
287
+ end
288
+ end
289
+ end
290
+
291
+ context "when #cardinal is called on a number" do
292
+ it "returns the cardinal form (e.g. 'second' of the number)" do
293
+ $workers.inflectors.cardinalizers.each do |cardinalizer|
294
+ @numbers.map { |num| number(num) }
295
+ .map { |num| num.cardinal(cardinalizer) }.should eql @cardinal
296
+ end
297
+ end
298
+ end
299
+
300
+ end
301
+
302
+ describe Treat::Workers::Inflectors::Stemmers do
303
+ before do
304
+ @words = ["running"]
305
+ @stems = ["run"]
306
+ end
307
+ context "when #stem is called on a word" do
308
+ it "annotates the word with its stem and returns the stem" do
309
+ $workers.inflectors.stemmers.each do |stemmer|
310
+ @words.map { |wrd| wrd.stem(stemmer) }.should eql @stems
311
+ end
312
+ end
313
+ end
314
+ end
315
+
316
+ describe Treat::Workers::Extractors::NameTag do
317
+ before do
318
+ @groups = ["Obama and Sarkozy will meet in Berlin."]
319
+ @tags = [["person", nil, "person", nil, nil, nil, "location", nil]]
320
+ end
321
+
322
+ context "when #name_tag called on a tokenized group" do
323
+ it "tags each token with its name tag" do
324
+ $workers.extractors.name_tag.each do |tagger|
325
+ @groups.map { |grp| grp.tokenize.apply(:name_tag) }
326
+ .map { |grp| grp.tokens.map { |t| t.get(:name_tag) } }
327
+ .should eql @tags
328
+ end
329
+ end
330
+ end
331
+
332
+ end
333
+
334
+ describe Treat::Workers::Extractors::Topics do
335
+ before do
336
+ @files = ["./spec/workers/examples/english/test.txt"]
337
+ @topics = [['household goods and hardware',
338
+ 'united states of america', 'corporate/industrial']]
339
+ end
340
+ context "when #topics is called on a chunked, segmented and tokenized document" do
341
+ it "annotates the document with its general topics and returns them" do
342
+ $workers.extractors.topics.each do |extractor|
343
+ @files.map { |f| document(f).apply(:chunk, :segment, :tokenize) }
344
+ .map { |doc| doc.topics }.should eql @topics
345
+ end
346
+ end
347
+ end
348
+ end
349
+
350
+ describe Treat::Workers::Extractors::Time do
351
+ before do
352
+ @expressions = ["october 2006"]
353
+ @months = [10]
354
+ end
355
+ context "when called on a tokenized group representing a time expression" do
356
+ it "returns the DateTime object corresponding to the time" do
357
+ $workers.extractors.time.each do |extractor|
358
+ puts @expressions.map(&:time).inspect
359
+ @expressions.map(&:time).all? { |time| time
360
+ .is_a?(DateTime) }.should be_true
361
+ @expressions.map(&:time).map { |time| time.month }
362
+ .should eql @months
363
+ end
364
+ end
365
+ end
194
366
  end
367
+
368
+ describe Treat::Workers::Extractors::TopicWords do
369
+
370
+ before do
371
+ @collections = ["./spec/workers/examples/english/economist"]
372
+ @topic_words = [["euro", "zone", "european", "mrs", "greece", "chancellor",
373
+ "berlin", "practice", "german", "germans"], ["bank", "minister", "central",
374
+ "bajnai", "mr", "hu", "orban", "commission", "hungarian", "government"],
375
+ ["bank", "mr", "central", "bajnai", "prime", "government", "brussels",
376
+ "responsibility", "national", "independence"], ["mr", "bank", "central",
377
+ "policies", "prime", "minister", "today", "financial", "government", "funds"],
378
+ ["euro", "merkel", "mr", "zone", "european", "greece", "german", "berlin",
379
+ "sarkozy", "government"], ["mr", "bajnai", "today", "orban", "government",
380
+ "forced", "independence", "part", "hand", "minister"], ["sarkozy", "mrs",
381
+ "zone", "euro", "fiscal", "called", "greece", "merkel", "german", "financial"],
382
+ ["mr", "called", "central", "policies", "financial", "bank", "european",
383
+ "prime", "minister", "shift"], ["bajnai", "orban", "prime", "mr", "government",
384
+ "independence", "forced", "commission", "-", "hvg"], ["euro", "sarkozy", "fiscal",
385
+ "merkel", "mr", "chancellor", "european", "german", "agenda", "soap"], ["mr",
386
+ "bank", "called", "central", "today", "prime", "government", "minister", "european",
387
+ "crisis"], ["mr", "fiscal", "mrs", "sarkozy", "merkel", "euro", "summit", "tax",
388
+ "leaders", "ecb"], ["called", "government", "financial", "policies", "part", "bank",
389
+ "central", "press", "mr", "president"], ["sarkozy", "merkel", "euro", "mr", "summit",
390
+ "mrs", "fiscal", "merkozy", "economic", "german"], ["mr", "prime", "minister",
391
+ "policies", "government", "financial", "crisis", "bank", "called", "part"], ["mr",
392
+ "bank", "government", "today", "called", "central", "minister", "prime", "issues",
393
+ "president"], ["mr", "orban", "central", "government", "parliament", "hungarian",
394
+ "minister", "hu", "personal", "bajnai"], ["government", "called", "central", "european",
395
+ "today", "bank", "prime", "financial", "part", "deficit"], ["mr", "orban", "government",
396
+ "hungarian", "bank", "hvg", "minister", "-", "fidesz", "hand"], ["mr", "bank", "european",
397
+ "minister", "policies", "crisis", "government", "president", "called", "shift"]]
398
+ end
399
+
400
+ context "when #topic_words is called on a chunked, segmented and tokenized collection" do
401
+ it "annotates the collection with the topic words and returns them" do
402
+ $workers.extractors.topic_words.each do |extractor|
403
+ @collections.map(&method(:collection))
404
+ .map { |col| col.apply(:chunk,:segment,:tokenize) }
405
+ map { |col| col.topic_words }.should eql @topic_words
406
+ end
407
+ end
408
+ end
409
+ end
410
+
411
+ describe Treat::Workers::Inflectors::Conjugators do
412
+ before do
413
+ @infinitives = ["run"]
414
+ @participles = ["running"]
415
+ end
416
+
417
+ context "when #present_participle is called on a word or #conjugate " +
418
+ "is called on a word with option :form set to 'present_participle'" do
419
+ it "returns the present participle form of the verb" do
420
+ $workers.inflectors.conjugators.each do |conjugator|
421
+ @participles.map { |verb| verb
422
+ .infinitive(conjugator) }
423
+ .should eql @infinitives
424
+ @participles.map { |verb| verb.conjugate(
425
+ conjugator, form: 'infinitive') }
426
+ .should eql @infinitives
427
+ end
428
+ end
429
+ end
430
+
431
+ context "when #infinitive is called on a word or #conjugate is " +
432
+ "called on a word with option :form set to 'infinitive'" do
433
+ it "returns the infinitive form of the verb" do
434
+ $workers.inflectors.conjugators.each do |conjugator|
435
+ @infinitives.map { |verb| verb
436
+ .present_participle(conjugator) }
437
+ .should eql @participles
438
+ @infinitives.map { |verb| verb.conjugate(
439
+ conjugator, form: 'present_participle') }
440
+ .should eql @participles
441
+ end
442
+ end
443
+ end
444
+
445
+ end
446
+
447
+ describe Treat::Workers::Inflectors::Declensors do
448
+ before do
449
+ @singulars = ["man"]
450
+ @plurals = ["men"]
451
+ end
452
+ context "when #plural is called on a word, or #declense "+
453
+ "is called on a word with option :count set to 'plural'" do
454
+ it "returns the plural form of the word" do
455
+ $workers.inflectors.declensors.each do |declensor|
456
+ @singulars.map { |word| word.plural(declensor) }
457
+ .should eql @plurals
458
+ @singulars.map { |word| word
459
+ .declense(declensor, count: 'plural') }
460
+ .should eql @plurals
461
+ end
462
+ end
463
+ end
464
+ context "when #singular is called on a word, or #declense " +
465
+ "is called on a word with option :count set to 'singular'" do
466
+ it "returns the singular form of the word" do
467
+ $workers.inflectors.declensors.each do |declensor|
468
+ next if declensor == :linguistics
469
+ @plurals.map { |word| word.singular(declensor) }
470
+ .should eql @singulars
471
+ @singulars.map { |word| word
472
+ .declense(declensor, count: 'singular') }
473
+ .should eql @singulars
474
+ end
475
+ end
476
+ end
477
+ end
478
+
479
+ end