engtagger 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +75 -0
- data/.solargraph.yml +22 -0
- data/Gemfile +6 -2
- data/README.md +33 -31
- data/Rakefile +9 -1
- data/engtagger.gemspec +13 -10
- data/lib/engtagger/porter.rb +38 -60
- data/lib/engtagger/version.rb +3 -1
- data/lib/engtagger.rb +220 -207
- metadata +9 -8
- data/test/test_engtagger.rb +0 -246
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: engtagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-01-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: lru_redux
|
@@ -34,6 +34,8 @@ extensions: []
|
|
34
34
|
extra_rdoc_files: []
|
35
35
|
files:
|
36
36
|
- ".gitignore"
|
37
|
+
- ".rubocop.yml"
|
38
|
+
- ".solargraph.yml"
|
37
39
|
- ".yardopts"
|
38
40
|
- Gemfile
|
39
41
|
- LICENSE
|
@@ -48,9 +50,9 @@ files:
|
|
48
50
|
- lib/engtagger/unknown.yml
|
49
51
|
- lib/engtagger/version.rb
|
50
52
|
- lib/engtagger/words.yml
|
51
|
-
- test/test_engtagger.rb
|
52
53
|
homepage: http://github.com/yohasebe/engtagger
|
53
|
-
licenses:
|
54
|
+
licenses:
|
55
|
+
- GPL
|
54
56
|
metadata: {}
|
55
57
|
post_install_message:
|
56
58
|
rdoc_options: []
|
@@ -60,16 +62,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
60
62
|
requirements:
|
61
63
|
- - ">="
|
62
64
|
- !ruby/object:Gem::Version
|
63
|
-
version: '
|
65
|
+
version: '2.6'
|
64
66
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
67
|
requirements:
|
66
68
|
- - ">="
|
67
69
|
- !ruby/object:Gem::Version
|
68
70
|
version: '0'
|
69
71
|
requirements: []
|
70
|
-
rubygems_version: 3.
|
72
|
+
rubygems_version: 3.4.2
|
71
73
|
signing_key:
|
72
74
|
specification_version: 4
|
73
75
|
summary: A probability based, corpus-trained English POS tagger
|
74
|
-
test_files:
|
75
|
-
- test/test_engtagger.rb
|
76
|
+
test_files: []
|
data/test/test_engtagger.rb
DELETED
@@ -1,246 +0,0 @@
|
|
1
|
-
$ENGTAGGER_LIB = File.join(File.dirname(__FILE__), '..', 'lib')
|
2
|
-
$LOAD_PATH << $ENGTAGGER_LIB
|
3
|
-
require 'test/unit' unless defined? $ZENTEST and $ZENTEST
|
4
|
-
require 'engtagger'
|
5
|
-
|
6
|
-
class TestEngTagger < Test::Unit::TestCase
|
7
|
-
|
8
|
-
@@untagged =<<EOD
|
9
|
-
Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome."
|
10
|
-
EOD
|
11
|
-
|
12
|
-
@@tagged =<<EOD
|
13
|
-
<nnp>Lisa</nnp> <nnp>Raines</nnp> <ppc>,</ppc> <det>a</det> <nn>lawyer</nn> <cc>and</cc> <nn>director</nn> <in>of</in> <nn>government</nn> <nns>relations</nns> <in>for</in> <det>the</det> <nnp>Industrial</nnp> <nnp>Biotechnical</nnp> <nnp>Association</nnp> <ppc>,</ppc> <vbz>contends</vbz> <in>that</in> <det>a</det> <nn>judge</nn> <jj>well-versed</jj> <in>in</in> <nn>patent</nn> <nn>law</nn> <cc>and</cc> <det>the</det> <nns>concerns</nns> <in>of</in> <jj>research-based</jj> <nns>industries</nns> <md>would</md> <vb>have</vb> <vbn>ruled</vbn> <rb>otherwise</rb> <pp>.</pp>
|
14
|
-
EOD
|
15
|
-
|
16
|
-
# Testing class methods
|
17
|
-
|
18
|
-
def setup
|
19
|
-
@tagger = EngTagger.new
|
20
|
-
tagpath = File.join($ENGTAGGER_LIB, @tagger.conf[:tag_path])
|
21
|
-
wordpath = File.join($ENGTAGGER_LIB, @tagger.conf[:word_path])
|
22
|
-
if !File.exist?(tagpath) or !File.exist?(wordpath)
|
23
|
-
@tagger.install
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
def text_get_ext
|
28
|
-
model = '<cd>[^<]+</cd}>\s*'
|
29
|
-
assert_equal(model, EngTagger.get_ext(model, "cd"))
|
30
|
-
end
|
31
|
-
|
32
|
-
def test_explain_tag
|
33
|
-
assert_equal("noun", EngTagger.explain_tag("nn"))
|
34
|
-
assert_equal("verb_infinitive", EngTagger.explain_tag("vb"))
|
35
|
-
end
|
36
|
-
|
37
|
-
# Testing public instance methods
|
38
|
-
|
39
|
-
def test_add_tags
|
40
|
-
assert_instance_of(String, @tagger.add_tags(@@untagged))
|
41
|
-
end
|
42
|
-
|
43
|
-
def test_assign_tag
|
44
|
-
models = []; tests = []
|
45
|
-
models += [@tagger.conf[:unknown_word_tag], "sym"]
|
46
|
-
tests += [["pp","-unknown-"], ["pp", "-sym-"]]
|
47
|
-
models.length.times do |i|
|
48
|
-
assert_equal(models[i],@tagger.assign_tag(*tests[i]))
|
49
|
-
end
|
50
|
-
tests = []
|
51
|
-
tests += [["vb","water"], ["nn", "runs"]]
|
52
|
-
models.length.times do |i|
|
53
|
-
result = @tagger.assign_tag(*tests[i])
|
54
|
-
assert(EngTagger.hmm.keys.index(result))
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
def test_clean_text
|
59
|
-
test = "I am 100.0% sure that Dr. Watson is too naive. I'm sorry."
|
60
|
-
model = ["I","am","100.0","%","sure","that","Dr.","Watson","is","too","naive",".","I","'m","sorry","."]
|
61
|
-
assert_equal(model, @tagger.send(:clean_text, test))
|
62
|
-
end
|
63
|
-
|
64
|
-
def test_get_noun_phrases
|
65
|
-
result = @tagger.get_noun_phrases(@@tagged)
|
66
|
-
assert_instance_of(Hash, result)
|
67
|
-
end
|
68
|
-
|
69
|
-
def test_get_nouns
|
70
|
-
result = @tagger.get_nouns(@@tagged)
|
71
|
-
assert_instance_of(Hash, result)
|
72
|
-
end
|
73
|
-
|
74
|
-
def test_get_verbs
|
75
|
-
expected_result = { "have" => 1, "ruled" => 1, "contends" => 1 }
|
76
|
-
result = @tagger.get_verbs(@@tagged)
|
77
|
-
assert_equal(expected_result, result)
|
78
|
-
end
|
79
|
-
|
80
|
-
def test_get_adverbs
|
81
|
-
expected_result = { "otherwise" => 1 }
|
82
|
-
result = @tagger.get_adverbs(@@tagged)
|
83
|
-
assert_equal(expected_result, result)
|
84
|
-
end
|
85
|
-
|
86
|
-
def test_get_interrogatives
|
87
|
-
tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
|
88
|
-
expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
|
89
|
-
result = @tagger.get_interrogatives(tagged)
|
90
|
-
assert_equal(expected_result, result)
|
91
|
-
end
|
92
|
-
|
93
|
-
def test_get_question_parts
|
94
|
-
tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
|
95
|
-
expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
|
96
|
-
result = @tagger.get_question_parts(tagged)
|
97
|
-
assert_equal(expected_result, result)
|
98
|
-
end
|
99
|
-
|
100
|
-
def test_get_conjunctions
|
101
|
-
expected_result = { "and" => 2, "of" => 2, "for" => 1, "that" => 1, "in" => 1 }
|
102
|
-
result = @tagger.get_conjunctions(@@tagged)
|
103
|
-
assert_equal(expected_result, result)
|
104
|
-
end
|
105
|
-
|
106
|
-
def test_get_proper_nouns
|
107
|
-
test = "<nnp>BBC</nnp> <vbz>means</vbz> <nnp>British Broadcasting Corporation</nnp> <pp>.</pp>"
|
108
|
-
result = @tagger.get_proper_nouns(test)
|
109
|
-
assert_instance_of(Hash, result)
|
110
|
-
end
|
111
|
-
|
112
|
-
def test_get_readable
|
113
|
-
test = "I woke up to the sound of pouring rain."
|
114
|
-
result = @tagger.get_readable(test)
|
115
|
-
assert(String, result)
|
116
|
-
|
117
|
-
test = "I woke up to the sound of pouring rain."
|
118
|
-
result = @tagger.get_readable(test)
|
119
|
-
expected_result = "I/PRP woke/VBD up/RB to/TO the/DET sound/NN of/IN pouring/VBG rain/NN ./PP"
|
120
|
-
assert_equal(expected_result, result)
|
121
|
-
test = "I woke up with a <bad> word."
|
122
|
-
result = @tagger.get_readable(test)
|
123
|
-
expected_result = "I/PRP woke/VBD up/RB with/IN a/DET <bad>/NNP word/NN ./PP"
|
124
|
-
assert_equal(expected_result, result)
|
125
|
-
end
|
126
|
-
|
127
|
-
|
128
|
-
def test_get_sentences
|
129
|
-
result = @tagger.get_sentences(@@untagged)
|
130
|
-
assert_equal(4, result.length)
|
131
|
-
end
|
132
|
-
|
133
|
-
def test_get_words
|
134
|
-
@tagger.conf[:longest_noun_phrase] = 1
|
135
|
-
result1 = @tagger.get_words(@@tagged)
|
136
|
-
@tagger.conf[:longest_noun_phrase] = 10
|
137
|
-
result2 = @tagger.get_words(@@tagged)
|
138
|
-
assert_instance_of(Hash, result1)
|
139
|
-
assert_instance_of(Hash, result2)
|
140
|
-
end
|
141
|
-
|
142
|
-
# Testing private instance methods
|
143
|
-
|
144
|
-
def test_reset
|
145
|
-
@tagger.conf[:current_tag] = 'nn'
|
146
|
-
@tagger.send(:reset)
|
147
|
-
assert_equal('pp', @tagger.conf[:current_tag])
|
148
|
-
end
|
149
|
-
|
150
|
-
|
151
|
-
def test_classify_unknown_word
|
152
|
-
assert_equal("*LRB*", @tagger.send(:classify_unknown_word, "{"))
|
153
|
-
assert_equal("*NUM*", @tagger.send(:classify_unknown_word, "123.4567"))
|
154
|
-
assert_equal("*ORD*", @tagger.send(:classify_unknown_word, "40th"))
|
155
|
-
assert_equal("-abr-", @tagger.send(:classify_unknown_word, "GT-R"))
|
156
|
-
assert_equal("-hyp-adj-", @tagger.send(:classify_unknown_word, "extremely-high"))
|
157
|
-
assert_equal("-sym-", @tagger.send(:classify_unknown_word, "&&"))
|
158
|
-
assert_equal("-ing-", @tagger.send(:classify_unknown_word, "wikiing"))
|
159
|
-
assert_equal("-unknown-", @tagger.send(:classify_unknown_word, "asefasdf"))
|
160
|
-
end
|
161
|
-
|
162
|
-
|
163
|
-
def test_clean_word
|
164
|
-
models = []; tests = []
|
165
|
-
models += ["*NUM*"]
|
166
|
-
models += ["Plays"]
|
167
|
-
models += ["pleadingly"]
|
168
|
-
tests += ["1973.0820", "Plays", "Pleadingly"]
|
169
|
-
models.length.times do |i|
|
170
|
-
assert_equal(models[i], @tagger.send(:clean_word, tests[i]))
|
171
|
-
end
|
172
|
-
end
|
173
|
-
|
174
|
-
def test_get_max_noun_phrases
|
175
|
-
result = @tagger.send(:get_max_noun_phrases, @@tagged)
|
176
|
-
assert_instance_of(Hash, result)
|
177
|
-
end
|
178
|
-
|
179
|
-
def test_get_max_noun_regex
|
180
|
-
assert_instance_of(Regexp, @tagger.send(:get_max_noun_regex))
|
181
|
-
end
|
182
|
-
|
183
|
-
def test_split_punct
|
184
|
-
models = []; texts = []
|
185
|
-
models << ["`", "test"]; texts << "`test"
|
186
|
-
models << ["``", "test"]; texts << "\"test"
|
187
|
-
models << ["`", "test"]; texts << "'test"
|
188
|
-
models << ["''"]; texts << '"'
|
189
|
-
models << ["test", "'"]; texts << "test' "
|
190
|
-
models << ["-", "test", "-"]; texts << "---test-----"
|
191
|
-
models << ["test", ",", "test"]; texts << "test,test"
|
192
|
-
models << ["123,456"]; texts << "123,456"
|
193
|
-
models << ["test", ":", "test"]; texts << "test:test"
|
194
|
-
models << ["123", ":", "456"]; texts << "123:456"
|
195
|
-
models << ["test1", "...", "test2"]; texts << "test1...test2"
|
196
|
-
models << ["{", "ab","[","(","c",")","[","d","]","]","}"]; texts << "{ab[(c)[d]]}"
|
197
|
-
models << ["test", "#", "test"]; texts << "test#test"
|
198
|
-
models << ["I", "'d", "like"]; texts << "I'd like"
|
199
|
-
models << ["is", "n't", "so"]; texts << "isn't so"
|
200
|
-
models << ["we", "'re", "all"]; texts << "we're all"
|
201
|
-
|
202
|
-
texts.each_with_index do |text, index|
|
203
|
-
assert_equal(models[index], @tagger.send(:split_punct, text))
|
204
|
-
end
|
205
|
-
end
|
206
|
-
|
207
|
-
def test_split_sentences
|
208
|
-
models = []; tests = []
|
209
|
-
models << ["He", "is", "a", "u.s.", "army", "officer", "."]
|
210
|
-
tests << ["He", "is", "a", "u.s.", "army", "officer."]
|
211
|
-
models << ["He", "is", "Mr.", "Johnson", ".", "He", "'s", "my", "friend", "."]
|
212
|
-
tests << ["He", "is", "Mr.", "Johnson.", "He", "'s", "my", "friend."]
|
213
|
-
models.length.times do |i|
|
214
|
-
assert_equal(models[i], @tagger.send(:split_sentences, tests[i]))
|
215
|
-
end
|
216
|
-
end
|
217
|
-
|
218
|
-
def test_stem
|
219
|
-
word = "gets"
|
220
|
-
old = @tagger.conf[:stem]
|
221
|
-
@tagger.conf[:stem] = true
|
222
|
-
assert_equal("get", @tagger.stem(word))
|
223
|
-
# the following should not work since we memoize stem method
|
224
|
-
# @tagger.conf[:stem] = false
|
225
|
-
# assert_equal("gets", @tagger.stem(word))
|
226
|
-
@tagger.conf[:stem] = old
|
227
|
-
end
|
228
|
-
|
229
|
-
def test_strip_tags
|
230
|
-
assert_instance_of(String, @tagger.send(:strip_tags, @@tagged))
|
231
|
-
end
|
232
|
-
|
233
|
-
def test_valid_text
|
234
|
-
text = nil
|
235
|
-
assert(!@tagger.send(:valid_text, text))
|
236
|
-
text = "this is test text"
|
237
|
-
assert(@tagger.send(:valid_text, text))
|
238
|
-
text = ""
|
239
|
-
assert(!@tagger.send(:valid_text, text))
|
240
|
-
end
|
241
|
-
|
242
|
-
def test_override_default_params
|
243
|
-
@tagger = EngTagger.new(:longest_noun_phrase => 3)
|
244
|
-
assert_equal 3, @tagger.conf[:longest_noun_phrase]
|
245
|
-
end
|
246
|
-
end
|