engtagger 0.3.2 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: engtagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-08-05 00:00:00.000000000 Z
11
+ date: 2023-01-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: lru_redux
@@ -34,6 +34,8 @@ extensions: []
34
34
  extra_rdoc_files: []
35
35
  files:
36
36
  - ".gitignore"
37
+ - ".rubocop.yml"
38
+ - ".solargraph.yml"
37
39
  - ".yardopts"
38
40
  - Gemfile
39
41
  - LICENSE
@@ -48,9 +50,9 @@ files:
48
50
  - lib/engtagger/unknown.yml
49
51
  - lib/engtagger/version.rb
50
52
  - lib/engtagger/words.yml
51
- - test/test_engtagger.rb
52
53
  homepage: http://github.com/yohasebe/engtagger
53
- licenses: []
54
+ licenses:
55
+ - GPL
54
56
  metadata: {}
55
57
  post_install_message:
56
58
  rdoc_options: []
@@ -60,16 +62,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
60
62
  requirements:
61
63
  - - ">="
62
64
  - !ruby/object:Gem::Version
63
- version: '0'
65
+ version: '2.6'
64
66
  required_rubygems_version: !ruby/object:Gem::Requirement
65
67
  requirements:
66
68
  - - ">="
67
69
  - !ruby/object:Gem::Version
68
70
  version: '0'
69
71
  requirements: []
70
- rubygems_version: 3.3.3
72
+ rubygems_version: 3.4.2
71
73
  signing_key:
72
74
  specification_version: 4
73
75
  summary: A probability based, corpus-trained English POS tagger
74
- test_files:
75
- - test/test_engtagger.rb
76
+ test_files: []
@@ -1,246 +0,0 @@
1
- $ENGTAGGER_LIB = File.join(File.dirname(__FILE__), '..', 'lib')
2
- $LOAD_PATH << $ENGTAGGER_LIB
3
- require 'test/unit' unless defined? $ZENTEST and $ZENTEST
4
- require 'engtagger'
5
-
6
- class TestEngTagger < Test::Unit::TestCase
7
-
8
- @@untagged =<<EOD
9
- Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome."
10
- EOD
11
-
12
- @@tagged =<<EOD
13
- <nnp>Lisa</nnp> <nnp>Raines</nnp> <ppc>,</ppc> <det>a</det> <nn>lawyer</nn> <cc>and</cc> <nn>director</nn> <in>of</in> <nn>government</nn> <nns>relations</nns> <in>for</in> <det>the</det> <nnp>Industrial</nnp> <nnp>Biotechnical</nnp> <nnp>Association</nnp> <ppc>,</ppc> <vbz>contends</vbz> <in>that</in> <det>a</det> <nn>judge</nn> <jj>well-versed</jj> <in>in</in> <nn>patent</nn> <nn>law</nn> <cc>and</cc> <det>the</det> <nns>concerns</nns> <in>of</in> <jj>research-based</jj> <nns>industries</nns> <md>would</md> <vb>have</vb> <vbn>ruled</vbn> <rb>otherwise</rb> <pp>.</pp>
14
- EOD
15
-
16
- # Testing class methods
17
-
18
- def setup
19
- @tagger = EngTagger.new
20
- tagpath = File.join($ENGTAGGER_LIB, @tagger.conf[:tag_path])
21
- wordpath = File.join($ENGTAGGER_LIB, @tagger.conf[:word_path])
22
- if !File.exist?(tagpath) or !File.exist?(wordpath)
23
- @tagger.install
24
- end
25
- end
26
-
27
- def text_get_ext
28
- model = '<cd>[^<]+</cd}>\s*'
29
- assert_equal(model, EngTagger.get_ext(model, "cd"))
30
- end
31
-
32
- def test_explain_tag
33
- assert_equal("noun", EngTagger.explain_tag("nn"))
34
- assert_equal("verb_infinitive", EngTagger.explain_tag("vb"))
35
- end
36
-
37
- # Testing public instance methods
38
-
39
- def test_add_tags
40
- assert_instance_of(String, @tagger.add_tags(@@untagged))
41
- end
42
-
43
- def test_assign_tag
44
- models = []; tests = []
45
- models += [@tagger.conf[:unknown_word_tag], "sym"]
46
- tests += [["pp","-unknown-"], ["pp", "-sym-"]]
47
- models.length.times do |i|
48
- assert_equal(models[i],@tagger.assign_tag(*tests[i]))
49
- end
50
- tests = []
51
- tests += [["vb","water"], ["nn", "runs"]]
52
- models.length.times do |i|
53
- result = @tagger.assign_tag(*tests[i])
54
- assert(EngTagger.hmm.keys.index(result))
55
- end
56
- end
57
-
58
- def test_clean_text
59
- test = "I am 100.0% sure that Dr. Watson is too naive. I'm sorry."
60
- model = ["I","am","100.0","%","sure","that","Dr.","Watson","is","too","naive",".","I","'m","sorry","."]
61
- assert_equal(model, @tagger.send(:clean_text, test))
62
- end
63
-
64
- def test_get_noun_phrases
65
- result = @tagger.get_noun_phrases(@@tagged)
66
- assert_instance_of(Hash, result)
67
- end
68
-
69
- def test_get_nouns
70
- result = @tagger.get_nouns(@@tagged)
71
- assert_instance_of(Hash, result)
72
- end
73
-
74
- def test_get_verbs
75
- expected_result = { "have" => 1, "ruled" => 1, "contends" => 1 }
76
- result = @tagger.get_verbs(@@tagged)
77
- assert_equal(expected_result, result)
78
- end
79
-
80
- def test_get_adverbs
81
- expected_result = { "otherwise" => 1 }
82
- result = @tagger.get_adverbs(@@tagged)
83
- assert_equal(expected_result, result)
84
- end
85
-
86
- def test_get_interrogatives
87
- tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
88
- expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
89
- result = @tagger.get_interrogatives(tagged)
90
- assert_equal(expected_result, result)
91
- end
92
-
93
- def test_get_question_parts
94
- tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
95
- expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
96
- result = @tagger.get_question_parts(tagged)
97
- assert_equal(expected_result, result)
98
- end
99
-
100
- def test_get_conjunctions
101
- expected_result = { "and" => 2, "of" => 2, "for" => 1, "that" => 1, "in" => 1 }
102
- result = @tagger.get_conjunctions(@@tagged)
103
- assert_equal(expected_result, result)
104
- end
105
-
106
- def test_get_proper_nouns
107
- test = "<nnp>BBC</nnp> <vbz>means</vbz> <nnp>British Broadcasting Corporation</nnp> <pp>.</pp>"
108
- result = @tagger.get_proper_nouns(test)
109
- assert_instance_of(Hash, result)
110
- end
111
-
112
- def test_get_readable
113
- test = "I woke up to the sound of pouring rain."
114
- result = @tagger.get_readable(test)
115
- assert(String, result)
116
-
117
- test = "I woke up to the sound of pouring rain."
118
- result = @tagger.get_readable(test)
119
- expected_result = "I/PRP woke/VBD up/RB to/TO the/DET sound/NN of/IN pouring/VBG rain/NN ./PP"
120
- assert_equal(expected_result, result)
121
- test = "I woke up with a <bad> word."
122
- result = @tagger.get_readable(test)
123
- expected_result = "I/PRP woke/VBD up/RB with/IN a/DET <bad>/NNP word/NN ./PP"
124
- assert_equal(expected_result, result)
125
- end
126
-
127
-
128
- def test_get_sentences
129
- result = @tagger.get_sentences(@@untagged)
130
- assert_equal(4, result.length)
131
- end
132
-
133
- def test_get_words
134
- @tagger.conf[:longest_noun_phrase] = 1
135
- result1 = @tagger.get_words(@@tagged)
136
- @tagger.conf[:longest_noun_phrase] = 10
137
- result2 = @tagger.get_words(@@tagged)
138
- assert_instance_of(Hash, result1)
139
- assert_instance_of(Hash, result2)
140
- end
141
-
142
- # Testing private instance methods
143
-
144
- def test_reset
145
- @tagger.conf[:current_tag] = 'nn'
146
- @tagger.send(:reset)
147
- assert_equal('pp', @tagger.conf[:current_tag])
148
- end
149
-
150
-
151
- def test_classify_unknown_word
152
- assert_equal("*LRB*", @tagger.send(:classify_unknown_word, "{"))
153
- assert_equal("*NUM*", @tagger.send(:classify_unknown_word, "123.4567"))
154
- assert_equal("*ORD*", @tagger.send(:classify_unknown_word, "40th"))
155
- assert_equal("-abr-", @tagger.send(:classify_unknown_word, "GT-R"))
156
- assert_equal("-hyp-adj-", @tagger.send(:classify_unknown_word, "extremely-high"))
157
- assert_equal("-sym-", @tagger.send(:classify_unknown_word, "&&"))
158
- assert_equal("-ing-", @tagger.send(:classify_unknown_word, "wikiing"))
159
- assert_equal("-unknown-", @tagger.send(:classify_unknown_word, "asefasdf"))
160
- end
161
-
162
-
163
- def test_clean_word
164
- models = []; tests = []
165
- models += ["*NUM*"]
166
- models += ["Plays"]
167
- models += ["pleadingly"]
168
- tests += ["1973.0820", "Plays", "Pleadingly"]
169
- models.length.times do |i|
170
- assert_equal(models[i], @tagger.send(:clean_word, tests[i]))
171
- end
172
- end
173
-
174
- def test_get_max_noun_phrases
175
- result = @tagger.send(:get_max_noun_phrases, @@tagged)
176
- assert_instance_of(Hash, result)
177
- end
178
-
179
- def test_get_max_noun_regex
180
- assert_instance_of(Regexp, @tagger.send(:get_max_noun_regex))
181
- end
182
-
183
- def test_split_punct
184
- models = []; texts = []
185
- models << ["`", "test"]; texts << "`test"
186
- models << ["``", "test"]; texts << "\"test"
187
- models << ["`", "test"]; texts << "'test"
188
- models << ["''"]; texts << '"'
189
- models << ["test", "'"]; texts << "test' "
190
- models << ["-", "test", "-"]; texts << "---test-----"
191
- models << ["test", ",", "test"]; texts << "test,test"
192
- models << ["123,456"]; texts << "123,456"
193
- models << ["test", ":", "test"]; texts << "test:test"
194
- models << ["123", ":", "456"]; texts << "123:456"
195
- models << ["test1", "...", "test2"]; texts << "test1...test2"
196
- models << ["{", "ab","[","(","c",")","[","d","]","]","}"]; texts << "{ab[(c)[d]]}"
197
- models << ["test", "#", "test"]; texts << "test#test"
198
- models << ["I", "'d", "like"]; texts << "I'd like"
199
- models << ["is", "n't", "so"]; texts << "isn't so"
200
- models << ["we", "'re", "all"]; texts << "we're all"
201
-
202
- texts.each_with_index do |text, index|
203
- assert_equal(models[index], @tagger.send(:split_punct, text))
204
- end
205
- end
206
-
207
- def test_split_sentences
208
- models = []; tests = []
209
- models << ["He", "is", "a", "u.s.", "army", "officer", "."]
210
- tests << ["He", "is", "a", "u.s.", "army", "officer."]
211
- models << ["He", "is", "Mr.", "Johnson", ".", "He", "'s", "my", "friend", "."]
212
- tests << ["He", "is", "Mr.", "Johnson.", "He", "'s", "my", "friend."]
213
- models.length.times do |i|
214
- assert_equal(models[i], @tagger.send(:split_sentences, tests[i]))
215
- end
216
- end
217
-
218
- def test_stem
219
- word = "gets"
220
- old = @tagger.conf[:stem]
221
- @tagger.conf[:stem] = true
222
- assert_equal("get", @tagger.stem(word))
223
- # the following should not work since we memoize stem method
224
- # @tagger.conf[:stem] = false
225
- # assert_equal("gets", @tagger.stem(word))
226
- @tagger.conf[:stem] = old
227
- end
228
-
229
- def test_strip_tags
230
- assert_instance_of(String, @tagger.send(:strip_tags, @@tagged))
231
- end
232
-
233
- def test_valid_text
234
- text = nil
235
- assert(!@tagger.send(:valid_text, text))
236
- text = "this is test text"
237
- assert(@tagger.send(:valid_text, text))
238
- text = ""
239
- assert(!@tagger.send(:valid_text, text))
240
- end
241
-
242
- def test_override_default_params
243
- @tagger = EngTagger.new(:longest_noun_phrase => 3)
244
- assert_equal 3, @tagger.conf[:longest_noun_phrase]
245
- end
246
- end