engtagger 0.3.2 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +72 -0
- data/.solargraph.yml +22 -0
- data/Gemfile +6 -2
- data/README.md +74 -42
- data/Rakefile +9 -1
- data/engtagger.gemspec +13 -10
- data/lib/engtagger/porter.rb +169 -192
- data/lib/engtagger/version.rb +3 -1
- data/lib/engtagger.rb +220 -206
- metadata +9 -8
- data/test/test_engtagger.rb +0 -246
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: engtagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.2
|
4
|
+
version: 0.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-04-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: lru_redux
|
@@ -34,6 +34,8 @@ extensions: []
|
|
34
34
|
extra_rdoc_files: []
|
35
35
|
files:
|
36
36
|
- ".gitignore"
|
37
|
+
- ".rubocop.yml"
|
38
|
+
- ".solargraph.yml"
|
37
39
|
- ".yardopts"
|
38
40
|
- Gemfile
|
39
41
|
- LICENSE
|
@@ -48,9 +50,9 @@ files:
|
|
48
50
|
- lib/engtagger/unknown.yml
|
49
51
|
- lib/engtagger/version.rb
|
50
52
|
- lib/engtagger/words.yml
|
51
|
-
- test/test_engtagger.rb
|
52
53
|
homepage: http://github.com/yohasebe/engtagger
|
53
|
-
licenses:
|
54
|
+
licenses:
|
55
|
+
- GPL
|
54
56
|
metadata: {}
|
55
57
|
post_install_message:
|
56
58
|
rdoc_options: []
|
@@ -60,16 +62,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
60
62
|
requirements:
|
61
63
|
- - ">="
|
62
64
|
- !ruby/object:Gem::Version
|
63
|
-
version: '
|
65
|
+
version: '2.6'
|
64
66
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
67
|
requirements:
|
66
68
|
- - ">="
|
67
69
|
- !ruby/object:Gem::Version
|
68
70
|
version: '0'
|
69
71
|
requirements: []
|
70
|
-
rubygems_version: 3.
|
72
|
+
rubygems_version: 3.4.12
|
71
73
|
signing_key:
|
72
74
|
specification_version: 4
|
73
75
|
summary: A probability based, corpus-trained English POS tagger
|
74
|
-
test_files:
|
75
|
-
- test/test_engtagger.rb
|
76
|
+
test_files: []
|
data/test/test_engtagger.rb
DELETED
@@ -1,246 +0,0 @@
|
|
1
|
-
$ENGTAGGER_LIB = File.join(File.dirname(__FILE__), '..', 'lib')
|
2
|
-
$LOAD_PATH << $ENGTAGGER_LIB
|
3
|
-
require 'test/unit' unless defined? $ZENTEST and $ZENTEST
|
4
|
-
require 'engtagger'
|
5
|
-
|
6
|
-
class TestEngTagger < Test::Unit::TestCase
|
7
|
-
|
8
|
-
@@untagged =<<EOD
|
9
|
-
Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome."
|
10
|
-
EOD
|
11
|
-
|
12
|
-
@@tagged =<<EOD
|
13
|
-
<nnp>Lisa</nnp> <nnp>Raines</nnp> <ppc>,</ppc> <det>a</det> <nn>lawyer</nn> <cc>and</cc> <nn>director</nn> <in>of</in> <nn>government</nn> <nns>relations</nns> <in>for</in> <det>the</det> <nnp>Industrial</nnp> <nnp>Biotechnical</nnp> <nnp>Association</nnp> <ppc>,</ppc> <vbz>contends</vbz> <in>that</in> <det>a</det> <nn>judge</nn> <jj>well-versed</jj> <in>in</in> <nn>patent</nn> <nn>law</nn> <cc>and</cc> <det>the</det> <nns>concerns</nns> <in>of</in> <jj>research-based</jj> <nns>industries</nns> <md>would</md> <vb>have</vb> <vbn>ruled</vbn> <rb>otherwise</rb> <pp>.</pp>
|
14
|
-
EOD
|
15
|
-
|
16
|
-
# Testing class methods
|
17
|
-
|
18
|
-
def setup
|
19
|
-
@tagger = EngTagger.new
|
20
|
-
tagpath = File.join($ENGTAGGER_LIB, @tagger.conf[:tag_path])
|
21
|
-
wordpath = File.join($ENGTAGGER_LIB, @tagger.conf[:word_path])
|
22
|
-
if !File.exist?(tagpath) or !File.exist?(wordpath)
|
23
|
-
@tagger.install
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
def text_get_ext
|
28
|
-
model = '<cd>[^<]+</cd}>\s*'
|
29
|
-
assert_equal(model, EngTagger.get_ext(model, "cd"))
|
30
|
-
end
|
31
|
-
|
32
|
-
def test_explain_tag
|
33
|
-
assert_equal("noun", EngTagger.explain_tag("nn"))
|
34
|
-
assert_equal("verb_infinitive", EngTagger.explain_tag("vb"))
|
35
|
-
end
|
36
|
-
|
37
|
-
# Testing public instance methods
|
38
|
-
|
39
|
-
def test_add_tags
|
40
|
-
assert_instance_of(String, @tagger.add_tags(@@untagged))
|
41
|
-
end
|
42
|
-
|
43
|
-
def test_assign_tag
|
44
|
-
models = []; tests = []
|
45
|
-
models += [@tagger.conf[:unknown_word_tag], "sym"]
|
46
|
-
tests += [["pp","-unknown-"], ["pp", "-sym-"]]
|
47
|
-
models.length.times do |i|
|
48
|
-
assert_equal(models[i],@tagger.assign_tag(*tests[i]))
|
49
|
-
end
|
50
|
-
tests = []
|
51
|
-
tests += [["vb","water"], ["nn", "runs"]]
|
52
|
-
models.length.times do |i|
|
53
|
-
result = @tagger.assign_tag(*tests[i])
|
54
|
-
assert(EngTagger.hmm.keys.index(result))
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
def test_clean_text
|
59
|
-
test = "I am 100.0% sure that Dr. Watson is too naive. I'm sorry."
|
60
|
-
model = ["I","am","100.0","%","sure","that","Dr.","Watson","is","too","naive",".","I","'m","sorry","."]
|
61
|
-
assert_equal(model, @tagger.send(:clean_text, test))
|
62
|
-
end
|
63
|
-
|
64
|
-
def test_get_noun_phrases
|
65
|
-
result = @tagger.get_noun_phrases(@@tagged)
|
66
|
-
assert_instance_of(Hash, result)
|
67
|
-
end
|
68
|
-
|
69
|
-
def test_get_nouns
|
70
|
-
result = @tagger.get_nouns(@@tagged)
|
71
|
-
assert_instance_of(Hash, result)
|
72
|
-
end
|
73
|
-
|
74
|
-
def test_get_verbs
|
75
|
-
expected_result = { "have" => 1, "ruled" => 1, "contends" => 1 }
|
76
|
-
result = @tagger.get_verbs(@@tagged)
|
77
|
-
assert_equal(expected_result, result)
|
78
|
-
end
|
79
|
-
|
80
|
-
def test_get_adverbs
|
81
|
-
expected_result = { "otherwise" => 1 }
|
82
|
-
result = @tagger.get_adverbs(@@tagged)
|
83
|
-
assert_equal(expected_result, result)
|
84
|
-
end
|
85
|
-
|
86
|
-
def test_get_interrogatives
|
87
|
-
tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
|
88
|
-
expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
|
89
|
-
result = @tagger.get_interrogatives(tagged)
|
90
|
-
assert_equal(expected_result, result)
|
91
|
-
end
|
92
|
-
|
93
|
-
def test_get_question_parts
|
94
|
-
tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
|
95
|
-
expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
|
96
|
-
result = @tagger.get_question_parts(tagged)
|
97
|
-
assert_equal(expected_result, result)
|
98
|
-
end
|
99
|
-
|
100
|
-
def test_get_conjunctions
|
101
|
-
expected_result = { "and" => 2, "of" => 2, "for" => 1, "that" => 1, "in" => 1 }
|
102
|
-
result = @tagger.get_conjunctions(@@tagged)
|
103
|
-
assert_equal(expected_result, result)
|
104
|
-
end
|
105
|
-
|
106
|
-
def test_get_proper_nouns
|
107
|
-
test = "<nnp>BBC</nnp> <vbz>means</vbz> <nnp>British Broadcasting Corporation</nnp> <pp>.</pp>"
|
108
|
-
result = @tagger.get_proper_nouns(test)
|
109
|
-
assert_instance_of(Hash, result)
|
110
|
-
end
|
111
|
-
|
112
|
-
def test_get_readable
|
113
|
-
test = "I woke up to the sound of pouring rain."
|
114
|
-
result = @tagger.get_readable(test)
|
115
|
-
assert(String, result)
|
116
|
-
|
117
|
-
test = "I woke up to the sound of pouring rain."
|
118
|
-
result = @tagger.get_readable(test)
|
119
|
-
expected_result = "I/PRP woke/VBD up/RB to/TO the/DET sound/NN of/IN pouring/VBG rain/NN ./PP"
|
120
|
-
assert_equal(expected_result, result)
|
121
|
-
test = "I woke up with a <bad> word."
|
122
|
-
result = @tagger.get_readable(test)
|
123
|
-
expected_result = "I/PRP woke/VBD up/RB with/IN a/DET <bad>/NNP word/NN ./PP"
|
124
|
-
assert_equal(expected_result, result)
|
125
|
-
end
|
126
|
-
|
127
|
-
|
128
|
-
def test_get_sentences
|
129
|
-
result = @tagger.get_sentences(@@untagged)
|
130
|
-
assert_equal(4, result.length)
|
131
|
-
end
|
132
|
-
|
133
|
-
def test_get_words
|
134
|
-
@tagger.conf[:longest_noun_phrase] = 1
|
135
|
-
result1 = @tagger.get_words(@@tagged)
|
136
|
-
@tagger.conf[:longest_noun_phrase] = 10
|
137
|
-
result2 = @tagger.get_words(@@tagged)
|
138
|
-
assert_instance_of(Hash, result1)
|
139
|
-
assert_instance_of(Hash, result2)
|
140
|
-
end
|
141
|
-
|
142
|
-
# Testing private instance methods
|
143
|
-
|
144
|
-
def test_reset
|
145
|
-
@tagger.conf[:current_tag] = 'nn'
|
146
|
-
@tagger.send(:reset)
|
147
|
-
assert_equal('pp', @tagger.conf[:current_tag])
|
148
|
-
end
|
149
|
-
|
150
|
-
|
151
|
-
def test_classify_unknown_word
|
152
|
-
assert_equal("*LRB*", @tagger.send(:classify_unknown_word, "{"))
|
153
|
-
assert_equal("*NUM*", @tagger.send(:classify_unknown_word, "123.4567"))
|
154
|
-
assert_equal("*ORD*", @tagger.send(:classify_unknown_word, "40th"))
|
155
|
-
assert_equal("-abr-", @tagger.send(:classify_unknown_word, "GT-R"))
|
156
|
-
assert_equal("-hyp-adj-", @tagger.send(:classify_unknown_word, "extremely-high"))
|
157
|
-
assert_equal("-sym-", @tagger.send(:classify_unknown_word, "&&"))
|
158
|
-
assert_equal("-ing-", @tagger.send(:classify_unknown_word, "wikiing"))
|
159
|
-
assert_equal("-unknown-", @tagger.send(:classify_unknown_word, "asefasdf"))
|
160
|
-
end
|
161
|
-
|
162
|
-
|
163
|
-
def test_clean_word
|
164
|
-
models = []; tests = []
|
165
|
-
models += ["*NUM*"]
|
166
|
-
models += ["Plays"]
|
167
|
-
models += ["pleadingly"]
|
168
|
-
tests += ["1973.0820", "Plays", "Pleadingly"]
|
169
|
-
models.length.times do |i|
|
170
|
-
assert_equal(models[i], @tagger.send(:clean_word, tests[i]))
|
171
|
-
end
|
172
|
-
end
|
173
|
-
|
174
|
-
def test_get_max_noun_phrases
|
175
|
-
result = @tagger.send(:get_max_noun_phrases, @@tagged)
|
176
|
-
assert_instance_of(Hash, result)
|
177
|
-
end
|
178
|
-
|
179
|
-
def test_get_max_noun_regex
|
180
|
-
assert_instance_of(Regexp, @tagger.send(:get_max_noun_regex))
|
181
|
-
end
|
182
|
-
|
183
|
-
def test_split_punct
|
184
|
-
models = []; texts = []
|
185
|
-
models << ["`", "test"]; texts << "`test"
|
186
|
-
models << ["``", "test"]; texts << "\"test"
|
187
|
-
models << ["`", "test"]; texts << "'test"
|
188
|
-
models << ["''"]; texts << '"'
|
189
|
-
models << ["test", "'"]; texts << "test' "
|
190
|
-
models << ["-", "test", "-"]; texts << "---test-----"
|
191
|
-
models << ["test", ",", "test"]; texts << "test,test"
|
192
|
-
models << ["123,456"]; texts << "123,456"
|
193
|
-
models << ["test", ":", "test"]; texts << "test:test"
|
194
|
-
models << ["123", ":", "456"]; texts << "123:456"
|
195
|
-
models << ["test1", "...", "test2"]; texts << "test1...test2"
|
196
|
-
models << ["{", "ab","[","(","c",")","[","d","]","]","}"]; texts << "{ab[(c)[d]]}"
|
197
|
-
models << ["test", "#", "test"]; texts << "test#test"
|
198
|
-
models << ["I", "'d", "like"]; texts << "I'd like"
|
199
|
-
models << ["is", "n't", "so"]; texts << "isn't so"
|
200
|
-
models << ["we", "'re", "all"]; texts << "we're all"
|
201
|
-
|
202
|
-
texts.each_with_index do |text, index|
|
203
|
-
assert_equal(models[index], @tagger.send(:split_punct, text))
|
204
|
-
end
|
205
|
-
end
|
206
|
-
|
207
|
-
def test_split_sentences
|
208
|
-
models = []; tests = []
|
209
|
-
models << ["He", "is", "a", "u.s.", "army", "officer", "."]
|
210
|
-
tests << ["He", "is", "a", "u.s.", "army", "officer."]
|
211
|
-
models << ["He", "is", "Mr.", "Johnson", ".", "He", "'s", "my", "friend", "."]
|
212
|
-
tests << ["He", "is", "Mr.", "Johnson.", "He", "'s", "my", "friend."]
|
213
|
-
models.length.times do |i|
|
214
|
-
assert_equal(models[i], @tagger.send(:split_sentences, tests[i]))
|
215
|
-
end
|
216
|
-
end
|
217
|
-
|
218
|
-
def test_stem
|
219
|
-
word = "gets"
|
220
|
-
old = @tagger.conf[:stem]
|
221
|
-
@tagger.conf[:stem] = true
|
222
|
-
assert_equal("get", @tagger.stem(word))
|
223
|
-
# the following should not work since we memoize stem method
|
224
|
-
# @tagger.conf[:stem] = false
|
225
|
-
# assert_equal("gets", @tagger.stem(word))
|
226
|
-
@tagger.conf[:stem] = old
|
227
|
-
end
|
228
|
-
|
229
|
-
def test_strip_tags
|
230
|
-
assert_instance_of(String, @tagger.send(:strip_tags, @@tagged))
|
231
|
-
end
|
232
|
-
|
233
|
-
def test_valid_text
|
234
|
-
text = nil
|
235
|
-
assert(!@tagger.send(:valid_text, text))
|
236
|
-
text = "this is test text"
|
237
|
-
assert(@tagger.send(:valid_text, text))
|
238
|
-
text = ""
|
239
|
-
assert(!@tagger.send(:valid_text, text))
|
240
|
-
end
|
241
|
-
|
242
|
-
def test_override_default_params
|
243
|
-
@tagger = EngTagger.new(:longest_noun_phrase => 3)
|
244
|
-
assert_equal 3, @tagger.conf[:longest_noun_phrase]
|
245
|
-
end
|
246
|
-
end
|