engtagger 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.yardopts +5 -0
- data/Gemfile +1 -2
- data/README.md +19 -25
- data/engtagger.gemspec +4 -4
- data/lib/engtagger/porter.rb +12 -12
- data/lib/engtagger/pos_tags.hash +0 -0
- data/lib/engtagger/pos_words.hash +0 -0
- data/lib/engtagger/version.rb +2 -2
- data/lib/engtagger.rb +341 -290
- data/test/test_engtagger.rb +246 -201
- metadata +7 -7
data/test/test_engtagger.rb
CHANGED
@@ -1,201 +1,246 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
@@
|
13
|
-
Lisa Raines
|
14
|
-
EOD
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
tagpath
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
def test_add_tags
|
40
|
-
assert_instance_of(String, @tagger.add_tags(@@untagged))
|
41
|
-
end
|
42
|
-
|
43
|
-
def test_assign_tag
|
44
|
-
models = []; tests = []
|
45
|
-
models += [@tagger.conf[:unknown_word_tag], "sym"]
|
46
|
-
tests += [["pp","-unknown-"], ["pp", "-sym-"]]
|
47
|
-
models.length.times do |i|
|
48
|
-
assert_equal(models[i],@tagger.assign_tag(*tests[i]))
|
49
|
-
end
|
50
|
-
tests = []
|
51
|
-
tests += [["vb","water"], ["nn", "runs"]]
|
52
|
-
models.length.times do |i|
|
53
|
-
result = @tagger.assign_tag(*tests[i])
|
54
|
-
assert(EngTagger.hmm.keys.index(result))
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
def
|
59
|
-
|
60
|
-
|
61
|
-
assert_equal(
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
end
|
68
|
-
|
69
|
-
def
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
end
|
85
|
-
|
86
|
-
def
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
result = @tagger.
|
97
|
-
|
98
|
-
end
|
99
|
-
|
100
|
-
def
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
result = @tagger.
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
models
|
165
|
-
|
166
|
-
models
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
1
|
+
$ENGTAGGER_LIB = File.join(File.dirname(__FILE__), '..', 'lib')
|
2
|
+
$LOAD_PATH << $ENGTAGGER_LIB
|
3
|
+
require 'test/unit' unless defined? $ZENTEST and $ZENTEST
|
4
|
+
require 'engtagger'
|
5
|
+
|
6
|
+
# Test::Unit suite for EngTagger. Covers the class-level helpers, the public
# instance API, and (through +send+) the private helper methods.
class TestEngTagger < Test::Unit::TestCase

  # Untagged English prose used as input for add_tags / get_sentences.
  @@untagged =<<EOD
Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome."
EOD

  # XML-style tagged text used as input for the get_* extraction methods.
  @@tagged =<<EOD
<nnp>Lisa</nnp> <nnp>Raines</nnp> <ppc>,</ppc> <det>a</det> <nn>lawyer</nn> <cc>and</cc> <nn>director</nn> <in>of</in> <nn>government</nn> <nns>relations</nns> <in>for</in> <det>the</det> <nnp>Industrial</nnp> <nnp>Biotechnical</nnp> <nnp>Association</nnp> <ppc>,</ppc> <vbz>contends</vbz> <in>that</in> <det>a</det> <nn>judge</nn> <jj>well-versed</jj> <in>in</in> <nn>patent</nn> <nn>law</nn> <cc>and</cc> <det>the</det> <nns>concerns</nns> <in>of</in> <jj>research-based</jj> <nns>industries</nns> <md>would</md> <vb>have</vb> <vbn>ruled</vbn> <rb>otherwise</rb> <pp>.</pp>
EOD

  # Testing class methods

  # Creates a fresh tagger before each test and calls +install+ when either of
  # the files named by conf[:tag_path] / conf[:word_path] is missing under
  # $ENGTAGGER_LIB.
  def setup
    @tagger = EngTagger.new
    tagpath = File.join($ENGTAGGER_LIB, @tagger.conf[:tag_path])
    wordpath = File.join($ENGTAGGER_LIB, @tagger.conf[:word_path])
    @tagger.install unless File.exist?(tagpath) && File.exist?(wordpath)
  end

  # NOTE(review): the name starts with "text_", not "test_", so test/unit does
  # not collect this method and it never runs. Presumably a typo for
  # test_get_ext; the model string also contains a stray "}" ("</cd}>").
  # Left unchanged here - confirm EngTagger.get_ext's signature and expected
  # return value before renaming, otherwise the newly enabled test may fail.
  def text_get_ext
    model = '<cd>[^<]+</cd}>\s*'
    assert_equal(model, EngTagger.get_ext(model, "cd"))
  end

  def test_explain_tag
    assert_equal("noun", EngTagger.explain_tag("nn"))
    assert_equal("verb_infinitive", EngTagger.explain_tag("vb"))
  end

  # Testing public instance methods

  def test_add_tags
    assert_instance_of(String, @tagger.add_tags(@@untagged))
  end

  def test_assign_tag
    # Inputs for which one specific tag is expected.
    expected_tags = [@tagger.conf[:unknown_word_tag], "sym"]
    exact_inputs = [["pp", "-unknown-"], ["pp", "-sym-"]]
    expected_tags.zip(exact_inputs) do |expected, args|
      assert_equal(expected, @tagger.assign_tag(*args))
    end

    # Inputs for which the result only has to be one of EngTagger.hmm's keys.
    # Iterate the inputs themselves rather than borrowing the length of the
    # unrelated expected_tags list.
    [["vb", "water"], ["nn", "runs"]].each do |args|
      result = @tagger.assign_tag(*args)
      assert(EngTagger.hmm.keys.include?(result))
    end
  end

  def test_clean_text
    test = "I am 100.0% sure that Dr. Watson is too naive. I'm sorry."
    model = ["I","am","100.0","%","sure","that","Dr.","Watson","is","too","naive",".","I","'m","sorry","."]
    assert_equal(model, @tagger.send(:clean_text, test))
  end

  def test_get_noun_phrases
    result = @tagger.get_noun_phrases(@@tagged)
    assert_instance_of(Hash, result)
  end

  def test_get_nouns
    result = @tagger.get_nouns(@@tagged)
    assert_instance_of(Hash, result)
  end

  def test_get_verbs
    expected_result = { "have" => 1, "ruled" => 1, "contends" => 1 }
    result = @tagger.get_verbs(@@tagged)
    assert_equal(expected_result, result)
  end

  def test_get_adverbs
    expected_result = { "otherwise" => 1 }
    result = @tagger.get_adverbs(@@tagged)
    assert_equal(expected_result, result)
  end

  def test_get_interrogatives
    tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
    expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
    result = @tagger.get_interrogatives(tagged)
    assert_equal(expected_result, result)
  end

  # Uses the same input and expectation as test_get_interrogatives, but goes
  # through get_question_parts.
  def test_get_question_parts
    tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
    expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
    result = @tagger.get_question_parts(tagged)
    assert_equal(expected_result, result)
  end

  def test_get_conjunctions
    expected_result = { "and" => 2, "of" => 2, "for" => 1, "that" => 1, "in" => 1 }
    result = @tagger.get_conjunctions(@@tagged)
    assert_equal(expected_result, result)
  end

  def test_get_proper_nouns
    test = "<nnp>BBC</nnp> <vbz>means</vbz> <nnp>British Broadcasting Corporation</nnp> <pp>.</pp>"
    result = @tagger.get_proper_nouns(test)
    assert_instance_of(Hash, result)
  end

  def test_get_readable
    test = "I woke up to the sound of pouring rain."
    result = @tagger.get_readable(test)
    # Was `assert(String, result)`, which only checks that the constant String
    # is truthy (result became the failure message) and so could never fail.
    assert_instance_of(String, result)
    expected_result = "I/PRP woke/VBD up/RB to/TO the/DET sound/NN of/IN pouring/VBG rain/NN ./PP"
    assert_equal(expected_result, result)

    # Angle-bracketed text in the input is expected to come back as a token.
    test = "I woke up with a <bad> word."
    result = @tagger.get_readable(test)
    expected_result = "I/PRP woke/VBD up/RB with/IN a/DET <bad>/NNP word/NN ./PP"
    assert_equal(expected_result, result)
  end

  def test_get_sentences
    result = @tagger.get_sentences(@@untagged)
    assert_equal(4, result.length)
  end

  def test_get_words
    @tagger.conf[:longest_noun_phrase] = 1
    result1 = @tagger.get_words(@@tagged)
    @tagger.conf[:longest_noun_phrase] = 10
    result2 = @tagger.get_words(@@tagged)
    assert_instance_of(Hash, result1)
    assert_instance_of(Hash, result2)
  end

  # Testing private instance methods

  def test_reset
    @tagger.conf[:current_tag] = 'nn'
    @tagger.send(:reset)
    assert_equal('pp', @tagger.conf[:current_tag])
  end

  def test_classify_unknown_word
    assert_equal("*LRB*", @tagger.send(:classify_unknown_word, "{"))
    assert_equal("*NUM*", @tagger.send(:classify_unknown_word, "123.4567"))
    assert_equal("*ORD*", @tagger.send(:classify_unknown_word, "40th"))
    assert_equal("-abr-", @tagger.send(:classify_unknown_word, "GT-R"))
    assert_equal("-hyp-adj-", @tagger.send(:classify_unknown_word, "extremely-high"))
    assert_equal("-sym-", @tagger.send(:classify_unknown_word, "&&"))
    assert_equal("-ing-", @tagger.send(:classify_unknown_word, "wikiing"))
    assert_equal("-unknown-", @tagger.send(:classify_unknown_word, "asefasdf"))
  end

  def test_clean_word
    # Each entry is [expected, input].
    cases = [
      ["*NUM*", "1973.0820"],
      ["Plays", "Plays"],
      ["pleadingly", "Pleadingly"]
    ]
    cases.each do |expected, word|
      assert_equal(expected, @tagger.send(:clean_word, word))
    end
  end

  def test_get_max_noun_phrases
    result = @tagger.send(:get_max_noun_phrases, @@tagged)
    assert_instance_of(Hash, result)
  end

  def test_get_max_noun_regex
    assert_instance_of(Regexp, @tagger.send(:get_max_noun_regex))
  end

  def test_split_punct
    # Each entry is [expected tokens, input text]. Kept as arrays (not Hash
    # keys) so the input strings are passed unfrozen, as before.
    cases = [
      [["`", "test"], "`test"],
      [["``", "test"], "\"test"],
      [["`", "test"], "'test"],
      [["''"], '"'],
      [["test", "'"], "test' "],
      [["-", "test", "-"], "---test-----"],
      [["test", ",", "test"], "test,test"],
      [["123,456"], "123,456"],
      [["test", ":", "test"], "test:test"],
      [["123", ":", "456"], "123:456"],
      [["test1", "...", "test2"], "test1...test2"],
      [["{", "ab","[","(","c",")","[","d","]","]","}"], "{ab[(c)[d]]}"],
      [["test", "#", "test"], "test#test"],
      [["I", "'d", "like"], "I'd like"],
      [["is", "n't", "so"], "isn't so"],
      [["we", "'re", "all"], "we're all"]
    ]

    cases.each do |expected, text|
      assert_equal(expected, @tagger.send(:split_punct, text))
    end
  end

  def test_split_sentences
    # Each entry is [expected tokens, input tokens].
    cases = [
      [["He", "is", "a", "u.s.", "army", "officer", "."],
       ["He", "is", "a", "u.s.", "army", "officer."]],
      [["He", "is", "Mr.", "Johnson", ".", "He", "'s", "my", "friend", "."],
       ["He", "is", "Mr.", "Johnson.", "He", "'s", "my", "friend."]]
    ]
    cases.each do |expected, tokens|
      assert_equal(expected, @tagger.send(:split_sentences, tokens))
    end
  end

  def test_stem
    word = "gets"
    old = @tagger.conf[:stem]
    @tagger.conf[:stem] = true
    assert_equal("get", @tagger.stem(word))
    # the following should not work since we memoize stem method
    # @tagger.conf[:stem] = false
    # assert_equal("gets", @tagger.stem(word))
    @tagger.conf[:stem] = old
  end

  def test_strip_tags
    assert_instance_of(String, @tagger.send(:strip_tags, @@tagged))
  end

  def test_valid_text
    text = nil
    assert(!@tagger.send(:valid_text, text))
    text = "this is test text"
    assert(@tagger.send(:valid_text, text))
    text = ""
    assert(!@tagger.send(:valid_text, text))
  end

  def test_override_default_params
    @tagger = EngTagger.new(:longest_noun_phrase => 3)
    assert_equal 3, @tagger.conf[:longest_noun_phrase]
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: engtagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-06-21 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
|
14
14
|
tagger that assigns POS tags to English text based on a lookup dictionary and a
|
@@ -20,6 +20,7 @@ extensions: []
|
|
20
20
|
extra_rdoc_files: []
|
21
21
|
files:
|
22
22
|
- ".gitignore"
|
23
|
+
- ".yardopts"
|
23
24
|
- Gemfile
|
24
25
|
- LICENSE
|
25
26
|
- README.md
|
@@ -37,7 +38,7 @@ files:
|
|
37
38
|
homepage: http://github.com/yohasebe/engtagger
|
38
39
|
licenses: []
|
39
40
|
metadata: {}
|
40
|
-
post_install_message:
|
41
|
+
post_install_message:
|
41
42
|
rdoc_options: []
|
42
43
|
require_paths:
|
43
44
|
- lib
|
@@ -52,9 +53,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
52
53
|
- !ruby/object:Gem::Version
|
53
54
|
version: '0'
|
54
55
|
requirements: []
|
55
|
-
|
56
|
-
|
57
|
-
signing_key:
|
56
|
+
rubygems_version: 3.3.3
|
57
|
+
signing_key:
|
58
58
|
specification_version: 4
|
59
59
|
summary: A probability based, corpus-trained English POS tagger
|
60
60
|
test_files:
|