engtagger 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,201 +1,246 @@
1
- # Code Generated by ZenTest v. 3.9.2
2
- # classname: asrt / meth = ratio%
3
- # EngTagger: 0 / 24 = 0.00%
4
-
5
- $ENGTAGGER_LIB = File.join(File.dirname(__FILE__), '..', 'lib')
6
- $LOAD_PATH << $ENGTAGGER_LIB
7
- require 'test/unit' unless defined? $ZENTEST and $ZENTEST
8
- require 'engtagger'
9
-
10
- class TestEngTagger < Test::Unit::TestCase
11
-
12
- @@untagged =<<EOD
13
- Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome."
14
- EOD
15
-
16
- @@tagged =<<EOD
17
- <nnp>Lisa</nnp> <nnp>Raines</nnp> <ppc>,</ppc> <det>a</det> <nn>lawyer</nn> <cc>and</cc> <nn>director</nn> <in>of</in> <nn>government</nn> <nns>relations</nns> <in>for</in> <det>the</det> <nnp>Industrial</nnp> <nnp>Biotechnical</nnp> <nnp>Association</nnp> <ppc>,</ppc> <vbz>contends</vbz> <in>that</in> <det>a</det> <nn>judge</nn> <jj>well-versed</jj> <in>in</in> <nn>patent</nn> <nn>law</nn> <cc>and</cc> <det>the</det> <nns>concerns</nns> <in>of</in> <jj>research-based</jj> <nns>industries</nns> <md>would</md> <vb>have</vb> <vbn>ruled</vbn> <rb>otherwise</rb> <pp>.</pp>
18
- EOD
19
-
20
- def setup
21
- @tagger = EngTagger.new
22
- tagpath = File.join($ENGTAGGER_LIB, @tagger.conf[:tag_path])
23
- wordpath = File.join($ENGTAGGER_LIB, @tagger.conf[:word_path])
24
- if !File.exists?(tagpath) or !File.exists?(wordpath)
25
- @tagger.install
26
- end
27
- end
28
-
29
- def text_get_ext
30
- model = '<cd>[^<]+</cd}>\s*'
31
- assert_equal(model, EngTagger.get_ext(model, "cd"))
32
- end
33
-
34
- def test_explain_tag
35
- assert_equal("noun", EngTagger.explain_tag("nn"))
36
- assert_equal("verb_infinitive", EngTagger.explain_tag("vb"))
37
- end
38
-
39
- def test_add_tags
40
- assert_instance_of(String, @tagger.add_tags(@@untagged))
41
- end
42
-
43
- def test_assign_tag
44
- models = []; tests = []
45
- models += [@tagger.conf[:unknown_word_tag], "sym"]
46
- tests += [["pp","-unknown-"], ["pp", "-sym-"]]
47
- models.length.times do |i|
48
- assert_equal(models[i],@tagger.assign_tag(*tests[i]))
49
- end
50
- tests = []
51
- tests += [["vb","water"], ["nn", "runs"]]
52
- models.length.times do |i|
53
- result = @tagger.assign_tag(*tests[i])
54
- assert(EngTagger.hmm.keys.index(result))
55
- end
56
- end
57
-
58
- def test_classify_unknown_word
59
- assert_equal("*LRB*", @tagger.classify_unknown_word("{"))
60
- assert_equal("*NUM*", @tagger.classify_unknown_word("123.4567"))
61
- assert_equal("*ORD*", @tagger.classify_unknown_word("40th"))
62
- assert_equal("-abr-", @tagger.classify_unknown_word("GT-R"))
63
- assert_equal("-hyp-adj-", @tagger.classify_unknown_word("extremely-high"))
64
- assert_equal("-sym-", @tagger.classify_unknown_word("&&"))
65
- assert_equal("-ing-", @tagger.classify_unknown_word("wikiing"))
66
- assert_equal("-unknown-", @tagger.classify_unknown_word("asefasdf"))
67
- end
68
-
69
- def test_clean_text
70
- test = "<html><body>I am <b>100% sure</b> that Dr. Watson is too naive. I'm sorry.</body></html>"
71
- model = ["I","am","100","%","sure","that","Dr.","Watson","is","too","naive",".","I","'m","sorry","."]
72
- assert_equal(model, @tagger.clean_text(test))
73
- end
74
-
75
- def test_clean_word
76
- models = []; tests = []
77
- models += ["*NUM*"]
78
- models += ["Plays"]
79
- models += ["pleadingly"]
80
- tests += ["1973.0820", "Plays", "Pleadingly"]
81
- models.length.times do |i|
82
- assert_equal(models[i], @tagger.clean_word(tests[i]))
83
- end
84
- end
85
-
86
- def test_get_max_noun_phrases
87
- result = @tagger.get_max_noun_phrases(@@tagged)
88
- assert_instance_of(Hash, result)
89
- end
90
-
91
- def test_get_max_noun_regex
92
- assert_instance_of(Regexp, @tagger.get_max_noun_regex)
93
- end
94
-
95
- def test_get_noun_phrases
96
- result = @tagger.get_noun_phrases(@@tagged)
97
- assert_instance_of(Hash, result)
98
- end
99
-
100
- def test_get_nouns
101
- result = @tagger.get_nouns(@@tagged)
102
- assert_instance_of(Hash, result)
103
- end
104
-
105
- def test_get_proper_nouns
106
- test = "<nnp>BBC</nnp> <vbz>means</vbz> <nnp>British Broadcasting Corporation</nnp> <pp>.</pp>"
107
- result = @tagger.get_proper_nouns(test)
108
- assert_instance_of(Hash, result)
109
- end
110
-
111
- def test_get_readable
112
- test = "I woke up to the sound of pouring rain."
113
- result = @tagger.get_readable(test)
114
- assert(String, result)
115
- end
116
-
117
- def test_get_sentences
118
- result = @tagger.get_sentences(@@untagged)
119
- assert_equal(4, result.length)
120
- end
121
-
122
- def test_get_words
123
- @tagger.conf[:longest_noun_phrase] = 1
124
- result1 = @tagger.get_words(@@tagged)
125
- @tagger.conf[:longest_noun_phrase] = 10
126
- result2 = @tagger.get_words(@@tagged)
127
- assert_instance_of(Hash, result1)
128
- assert_instance_of(Hash, result2)
129
- end
130
-
131
- def test_reset
132
- @tagger.conf[:current_tag] = 'nn'
133
- @tagger.reset
134
- assert_equal('pp', @tagger.conf[:current_tag])
135
- end
136
-
137
- def test_split_punct
138
- models = []; texts = []
139
- models << ["`", "test"]; texts << "`test"
140
- models << ["``", "test"]; texts << "\"test"
141
- models << ["`", "test"]; texts << "'test"
142
- models << ["''"]; texts << '"'
143
- models << ["test", "'"]; texts << "test' "
144
- models << ["-", "test", "-"]; texts << "---test-----"
145
- models << ["test", ",", "test"]; texts << "test,test"
146
- models << ["123,456"]; texts << "123,456"
147
- models << ["test", ":"]; texts << "test:"
148
- models << ["test1", "...", "test2"]; texts << "test1...test2"
149
- models << ["{", "ab","[","(","c",")","[","d","]","]","}"]; texts << "{ab[(c)[d]]}"
150
- models << ["test", "#", "test"]; texts << "test#test"
151
- models << ["I", "'d", "like"]; texts << "I'd like"
152
- models << ["is", "n't", "so"]; texts << "isn't so"
153
- models << ["we", "'re", "all"]; texts << "we're all"
154
-
155
- texts.each_with_index do |text, index|
156
- assert_equal(models[index], @tagger.split_punct(text))
157
- end
158
- end
159
-
160
- def test_split_sentences
161
- models = []; tests = []
162
- models << ["He", "is", "a", "u.s.", "army", "officer", "."]
163
- tests << ["He", "is", "a", "u.s.", "army", "officer."]
164
- models << ["He", "is", "Mr.", "Johnson", ".", "He", "'s", "my", "friend", "."]
165
- tests << ["He", "is", "Mr.", "Johnson.", "He", "'s", "my", "friend."]
166
- models.length.times do |i|
167
- assert_equal(models[i], @tagger.split_sentences(tests[i]))
168
- end
169
- end
170
-
171
- def test_stem
172
- word = "gets"
173
- old = @tagger.conf[:stem]
174
- @tagger.conf[:stem] = true
175
- assert_equal("get", @tagger.stem(word))
176
- # the following should not work since we memoize stem method
177
- # @tagger.conf[:stem] = false
178
- # assert_equal("gets", @tagger.stem(word))
179
- @tagger.conf[:stem] = old
180
- end
181
-
182
- def test_strip_tags
183
- assert_instance_of(String, @tagger.strip_tags(@@tagged))
184
- end
185
-
186
- def test_valid_text
187
- text = nil
188
- assert(!@tagger.valid_text(text))
189
- text = "this is test text"
190
- assert(@tagger.valid_text(text))
191
- text = ""
192
- assert(!@tagger.valid_text(text))
193
- end
194
-
195
- def test_override_default_params
196
- @tagger = EngTagger.new(:longest_noun_phrase => 3)
197
- assert_equal 3, @tagger.conf[:longest_noun_phrase]
198
- end
199
- end
200
-
201
- # Number of errors detected: 24
1
+ $ENGTAGGER_LIB = File.join(File.dirname(__FILE__), '..', 'lib')
2
+ $LOAD_PATH << $ENGTAGGER_LIB
3
+ require 'test/unit' unless defined? $ZENTEST and $ZENTEST
4
+ require 'engtagger'
5
+
6
+ class TestEngTagger < Test::Unit::TestCase
7
+
8
+ @@untagged =<<EOD
9
+ Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome."
10
+ EOD
11
+
12
+ @@tagged =<<EOD
13
+ <nnp>Lisa</nnp> <nnp>Raines</nnp> <ppc>,</ppc> <det>a</det> <nn>lawyer</nn> <cc>and</cc> <nn>director</nn> <in>of</in> <nn>government</nn> <nns>relations</nns> <in>for</in> <det>the</det> <nnp>Industrial</nnp> <nnp>Biotechnical</nnp> <nnp>Association</nnp> <ppc>,</ppc> <vbz>contends</vbz> <in>that</in> <det>a</det> <nn>judge</nn> <jj>well-versed</jj> <in>in</in> <nn>patent</nn> <nn>law</nn> <cc>and</cc> <det>the</det> <nns>concerns</nns> <in>of</in> <jj>research-based</jj> <nns>industries</nns> <md>would</md> <vb>have</vb> <vbn>ruled</vbn> <rb>otherwise</rb> <pp>.</pp>
14
+ EOD
15
+
16
+ # Testing class methods
17
+
18
+ def setup
19
+ @tagger = EngTagger.new
20
+ tagpath = File.join($ENGTAGGER_LIB, @tagger.conf[:tag_path])
21
+ wordpath = File.join($ENGTAGGER_LIB, @tagger.conf[:word_path])
22
+ if !File.exist?(tagpath) or !File.exist?(wordpath)
23
+ @tagger.install
24
+ end
25
+ end
26
+
27
+ def text_get_ext
28
+ model = '<cd>[^<]+</cd}>\s*'
29
+ assert_equal(model, EngTagger.get_ext(model, "cd"))
30
+ end
31
+
32
+ def test_explain_tag
33
+ assert_equal("noun", EngTagger.explain_tag("nn"))
34
+ assert_equal("verb_infinitive", EngTagger.explain_tag("vb"))
35
+ end
36
+
37
+ # Testing public instance methods
38
+
39
+ def test_add_tags
40
+ assert_instance_of(String, @tagger.add_tags(@@untagged))
41
+ end
42
+
43
+ def test_assign_tag
44
+ models = []; tests = []
45
+ models += [@tagger.conf[:unknown_word_tag], "sym"]
46
+ tests += [["pp","-unknown-"], ["pp", "-sym-"]]
47
+ models.length.times do |i|
48
+ assert_equal(models[i],@tagger.assign_tag(*tests[i]))
49
+ end
50
+ tests = []
51
+ tests += [["vb","water"], ["nn", "runs"]]
52
+ models.length.times do |i|
53
+ result = @tagger.assign_tag(*tests[i])
54
+ assert(EngTagger.hmm.keys.index(result))
55
+ end
56
+ end
57
+
58
+ def test_clean_text
59
+ test = "I am 100.0% sure that Dr. Watson is too naive. I'm sorry."
60
+ model = ["I","am","100.0","%","sure","that","Dr.","Watson","is","too","naive",".","I","'m","sorry","."]
61
+ assert_equal(model, @tagger.send(:clean_text, test))
62
+ end
63
+
64
+ def test_get_noun_phrases
65
+ result = @tagger.get_noun_phrases(@@tagged)
66
+ assert_instance_of(Hash, result)
67
+ end
68
+
69
+ def test_get_nouns
70
+ result = @tagger.get_nouns(@@tagged)
71
+ assert_instance_of(Hash, result)
72
+ end
73
+
74
+ def test_get_verbs
75
+ expected_result = { "have" => 1, "ruled" => 1, "contends" => 1 }
76
+ result = @tagger.get_verbs(@@tagged)
77
+ assert_equal(expected_result, result)
78
+ end
79
+
80
+ def test_get_adverbs
81
+ expected_result = { "otherwise" => 1 }
82
+ result = @tagger.get_adverbs(@@tagged)
83
+ assert_equal(expected_result, result)
84
+ end
85
+
86
+ def test_get_interrogatives
87
+ tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
88
+ expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
89
+ result = @tagger.get_interrogatives(tagged)
90
+ assert_equal(expected_result, result)
91
+ end
92
+
93
+ def test_get_question_parts
94
+ tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
95
+ expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
96
+ result = @tagger.get_question_parts(tagged)
97
+ assert_equal(expected_result, result)
98
+ end
99
+
100
+ def test_get_conjunctions
101
+ expected_result = { "and" => 2, "of" => 2, "for" => 1, "that" => 1, "in" => 1 }
102
+ result = @tagger.get_conjunctions(@@tagged)
103
+ assert_equal(expected_result, result)
104
+ end
105
+
106
+ def test_get_proper_nouns
107
+ test = "<nnp>BBC</nnp> <vbz>means</vbz> <nnp>British Broadcasting Corporation</nnp> <pp>.</pp>"
108
+ result = @tagger.get_proper_nouns(test)
109
+ assert_instance_of(Hash, result)
110
+ end
111
+
112
+ def test_get_readable
113
+ test = "I woke up to the sound of pouring rain."
114
+ result = @tagger.get_readable(test)
115
+ assert(String, result)
116
+
117
+ test = "I woke up to the sound of pouring rain."
118
+ result = @tagger.get_readable(test)
119
+ expected_result = "I/PRP woke/VBD up/RB to/TO the/DET sound/NN of/IN pouring/VBG rain/NN ./PP"
120
+ assert_equal(expected_result, result)
121
+ test = "I woke up with a <bad> word."
122
+ result = @tagger.get_readable(test)
123
+ expected_result = "I/PRP woke/VBD up/RB with/IN a/DET <bad>/NNP word/NN ./PP"
124
+ assert_equal(expected_result, result)
125
+ end
126
+
127
+
128
+ def test_get_sentences
129
+ result = @tagger.get_sentences(@@untagged)
130
+ assert_equal(4, result.length)
131
+ end
132
+
133
+ def test_get_words
134
+ @tagger.conf[:longest_noun_phrase] = 1
135
+ result1 = @tagger.get_words(@@tagged)
136
+ @tagger.conf[:longest_noun_phrase] = 10
137
+ result2 = @tagger.get_words(@@tagged)
138
+ assert_instance_of(Hash, result1)
139
+ assert_instance_of(Hash, result2)
140
+ end
141
+
142
+ # Testing private instance methods
143
+
144
+ def test_reset
145
+ @tagger.conf[:current_tag] = 'nn'
146
+ @tagger.send(:reset)
147
+ assert_equal('pp', @tagger.conf[:current_tag])
148
+ end
149
+
150
+
151
+ def test_classify_unknown_word
152
+ assert_equal("*LRB*", @tagger.send(:classify_unknown_word, "{"))
153
+ assert_equal("*NUM*", @tagger.send(:classify_unknown_word, "123.4567"))
154
+ assert_equal("*ORD*", @tagger.send(:classify_unknown_word, "40th"))
155
+ assert_equal("-abr-", @tagger.send(:classify_unknown_word, "GT-R"))
156
+ assert_equal("-hyp-adj-", @tagger.send(:classify_unknown_word, "extremely-high"))
157
+ assert_equal("-sym-", @tagger.send(:classify_unknown_word, "&&"))
158
+ assert_equal("-ing-", @tagger.send(:classify_unknown_word, "wikiing"))
159
+ assert_equal("-unknown-", @tagger.send(:classify_unknown_word, "asefasdf"))
160
+ end
161
+
162
+
163
+ def test_clean_word
164
+ models = []; tests = []
165
+ models += ["*NUM*"]
166
+ models += ["Plays"]
167
+ models += ["pleadingly"]
168
+ tests += ["1973.0820", "Plays", "Pleadingly"]
169
+ models.length.times do |i|
170
+ assert_equal(models[i], @tagger.send(:clean_word, tests[i]))
171
+ end
172
+ end
173
+
174
+ def test_get_max_noun_phrases
175
+ result = @tagger.send(:get_max_noun_phrases, @@tagged)
176
+ assert_instance_of(Hash, result)
177
+ end
178
+
179
+ def test_get_max_noun_regex
180
+ assert_instance_of(Regexp, @tagger.send(:get_max_noun_regex))
181
+ end
182
+
183
+ def test_split_punct
184
+ models = []; texts = []
185
+ models << ["`", "test"]; texts << "`test"
186
+ models << ["``", "test"]; texts << "\"test"
187
+ models << ["`", "test"]; texts << "'test"
188
+ models << ["''"]; texts << '"'
189
+ models << ["test", "'"]; texts << "test' "
190
+ models << ["-", "test", "-"]; texts << "---test-----"
191
+ models << ["test", ",", "test"]; texts << "test,test"
192
+ models << ["123,456"]; texts << "123,456"
193
+ models << ["test", ":", "test"]; texts << "test:test"
194
+ models << ["123", ":", "456"]; texts << "123:456"
195
+ models << ["test1", "...", "test2"]; texts << "test1...test2"
196
+ models << ["{", "ab","[","(","c",")","[","d","]","]","}"]; texts << "{ab[(c)[d]]}"
197
+ models << ["test", "#", "test"]; texts << "test#test"
198
+ models << ["I", "'d", "like"]; texts << "I'd like"
199
+ models << ["is", "n't", "so"]; texts << "isn't so"
200
+ models << ["we", "'re", "all"]; texts << "we're all"
201
+
202
+ texts.each_with_index do |text, index|
203
+ assert_equal(models[index], @tagger.send(:split_punct, text))
204
+ end
205
+ end
206
+
207
+ def test_split_sentences
208
+ models = []; tests = []
209
+ models << ["He", "is", "a", "u.s.", "army", "officer", "."]
210
+ tests << ["He", "is", "a", "u.s.", "army", "officer."]
211
+ models << ["He", "is", "Mr.", "Johnson", ".", "He", "'s", "my", "friend", "."]
212
+ tests << ["He", "is", "Mr.", "Johnson.", "He", "'s", "my", "friend."]
213
+ models.length.times do |i|
214
+ assert_equal(models[i], @tagger.send(:split_sentences, tests[i]))
215
+ end
216
+ end
217
+
218
+ def test_stem
219
+ word = "gets"
220
+ old = @tagger.conf[:stem]
221
+ @tagger.conf[:stem] = true
222
+ assert_equal("get", @tagger.stem(word))
223
+ # the following should not work since we memoize stem method
224
+ # @tagger.conf[:stem] = false
225
+ # assert_equal("gets", @tagger.stem(word))
226
+ @tagger.conf[:stem] = old
227
+ end
228
+
229
+ def test_strip_tags
230
+ assert_instance_of(String, @tagger.send(:strip_tags, @@tagged))
231
+ end
232
+
233
+ def test_valid_text
234
+ text = nil
235
+ assert(!@tagger.send(:valid_text, text))
236
+ text = "this is test text"
237
+ assert(@tagger.send(:valid_text, text))
238
+ text = ""
239
+ assert(!@tagger.send(:valid_text, text))
240
+ end
241
+
242
+ def test_override_default_params
243
+ @tagger = EngTagger.new(:longest_noun_phrase => 3)
244
+ assert_equal 3, @tagger.conf[:longest_noun_phrase]
245
+ end
246
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: engtagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-04-20 00:00:00.000000000 Z
11
+ date: 2022-06-21 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
14
14
  tagger that assigns POS tags to English text based on a lookup dictionary and a
@@ -20,6 +20,7 @@ extensions: []
20
20
  extra_rdoc_files: []
21
21
  files:
22
22
  - ".gitignore"
23
+ - ".yardopts"
23
24
  - Gemfile
24
25
  - LICENSE
25
26
  - README.md
@@ -37,7 +38,7 @@ files:
37
38
  homepage: http://github.com/yohasebe/engtagger
38
39
  licenses: []
39
40
  metadata: {}
40
- post_install_message:
41
+ post_install_message:
41
42
  rdoc_options: []
42
43
  require_paths:
43
44
  - lib
@@ -52,9 +53,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
52
53
  - !ruby/object:Gem::Version
53
54
  version: '0'
54
55
  requirements: []
55
- rubyforge_project:
56
- rubygems_version: 2.2.2
57
- signing_key:
56
+ rubygems_version: 3.3.3
57
+ signing_key:
58
58
  specification_version: 4
59
59
  summary: A probability based, corpus-trained English POS tagger
60
60
  test_files: