engtagger 0.3.2 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: engtagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-08-05 00:00:00.000000000 Z
11
+ date: 2024-04-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: lru_redux
@@ -34,6 +34,8 @@ extensions: []
34
34
  extra_rdoc_files: []
35
35
  files:
36
36
  - ".gitignore"
37
+ - ".rubocop.yml"
38
+ - ".solargraph.yml"
37
39
  - ".yardopts"
38
40
  - Gemfile
39
41
  - LICENSE
@@ -48,9 +50,9 @@ files:
48
50
  - lib/engtagger/unknown.yml
49
51
  - lib/engtagger/version.rb
50
52
  - lib/engtagger/words.yml
51
- - test/test_engtagger.rb
52
53
  homepage: http://github.com/yohasebe/engtagger
53
- licenses: []
54
+ licenses:
55
+ - GPL
54
56
  metadata: {}
55
57
  post_install_message:
56
58
  rdoc_options: []
@@ -60,16 +62,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
60
62
  requirements:
61
63
  - - ">="
62
64
  - !ruby/object:Gem::Version
63
- version: '0'
65
+ version: '2.6'
64
66
  required_rubygems_version: !ruby/object:Gem::Requirement
65
67
  requirements:
66
68
  - - ">="
67
69
  - !ruby/object:Gem::Version
68
70
  version: '0'
69
71
  requirements: []
70
- rubygems_version: 3.3.3
72
+ rubygems_version: 3.4.12
71
73
  signing_key:
72
74
  specification_version: 4
73
75
  summary: A probability based, corpus-trained English POS tagger
74
- test_files:
75
- - test/test_engtagger.rb
76
+ test_files: []
@@ -1,246 +0,0 @@
1
- $ENGTAGGER_LIB = File.join(File.dirname(__FILE__), '..', 'lib')
2
- $LOAD_PATH << $ENGTAGGER_LIB
3
- require 'test/unit' unless defined? $ZENTEST and $ZENTEST
4
- require 'engtagger'
5
-
6
- class TestEngTagger < Test::Unit::TestCase
7
-
8
- @@untagged =<<EOD
9
- Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome."
10
- EOD
11
-
12
- @@tagged =<<EOD
13
- <nnp>Lisa</nnp> <nnp>Raines</nnp> <ppc>,</ppc> <det>a</det> <nn>lawyer</nn> <cc>and</cc> <nn>director</nn> <in>of</in> <nn>government</nn> <nns>relations</nns> <in>for</in> <det>the</det> <nnp>Industrial</nnp> <nnp>Biotechnical</nnp> <nnp>Association</nnp> <ppc>,</ppc> <vbz>contends</vbz> <in>that</in> <det>a</det> <nn>judge</nn> <jj>well-versed</jj> <in>in</in> <nn>patent</nn> <nn>law</nn> <cc>and</cc> <det>the</det> <nns>concerns</nns> <in>of</in> <jj>research-based</jj> <nns>industries</nns> <md>would</md> <vb>have</vb> <vbn>ruled</vbn> <rb>otherwise</rb> <pp>.</pp>
14
- EOD
15
-
16
- # Testing class methods
17
-
18
- def setup
19
- @tagger = EngTagger.new
20
- tagpath = File.join($ENGTAGGER_LIB, @tagger.conf[:tag_path])
21
- wordpath = File.join($ENGTAGGER_LIB, @tagger.conf[:word_path])
22
- if !File.exist?(tagpath) or !File.exist?(wordpath)
23
- @tagger.install
24
- end
25
- end
26
-
27
- def text_get_ext
28
- model = '<cd>[^<]+</cd}>\s*'
29
- assert_equal(model, EngTagger.get_ext(model, "cd"))
30
- end
31
-
32
- def test_explain_tag
33
- assert_equal("noun", EngTagger.explain_tag("nn"))
34
- assert_equal("verb_infinitive", EngTagger.explain_tag("vb"))
35
- end
36
-
37
- # Testing public instance methods
38
-
39
- def test_add_tags
40
- assert_instance_of(String, @tagger.add_tags(@@untagged))
41
- end
42
-
43
- def test_assign_tag
44
- models = []; tests = []
45
- models += [@tagger.conf[:unknown_word_tag], "sym"]
46
- tests += [["pp","-unknown-"], ["pp", "-sym-"]]
47
- models.length.times do |i|
48
- assert_equal(models[i],@tagger.assign_tag(*tests[i]))
49
- end
50
- tests = []
51
- tests += [["vb","water"], ["nn", "runs"]]
52
- models.length.times do |i|
53
- result = @tagger.assign_tag(*tests[i])
54
- assert(EngTagger.hmm.keys.index(result))
55
- end
56
- end
57
-
58
- def test_clean_text
59
- test = "I am 100.0% sure that Dr. Watson is too naive. I'm sorry."
60
- model = ["I","am","100.0","%","sure","that","Dr.","Watson","is","too","naive",".","I","'m","sorry","."]
61
- assert_equal(model, @tagger.send(:clean_text, test))
62
- end
63
-
64
- def test_get_noun_phrases
65
- result = @tagger.get_noun_phrases(@@tagged)
66
- assert_instance_of(Hash, result)
67
- end
68
-
69
- def test_get_nouns
70
- result = @tagger.get_nouns(@@tagged)
71
- assert_instance_of(Hash, result)
72
- end
73
-
74
- def test_get_verbs
75
- expected_result = { "have" => 1, "ruled" => 1, "contends" => 1 }
76
- result = @tagger.get_verbs(@@tagged)
77
- assert_equal(expected_result, result)
78
- end
79
-
80
- def test_get_adverbs
81
- expected_result = { "otherwise" => 1 }
82
- result = @tagger.get_adverbs(@@tagged)
83
- assert_equal(expected_result, result)
84
- end
85
-
86
- def test_get_interrogatives
87
- tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
88
- expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
89
- result = @tagger.get_interrogatives(tagged)
90
- assert_equal(expected_result, result)
91
- end
92
-
93
- def test_get_question_parts
94
- tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
95
- expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
96
- result = @tagger.get_question_parts(tagged)
97
- assert_equal(expected_result, result)
98
- end
99
-
100
- def test_get_conjunctions
101
- expected_result = { "and" => 2, "of" => 2, "for" => 1, "that" => 1, "in" => 1 }
102
- result = @tagger.get_conjunctions(@@tagged)
103
- assert_equal(expected_result, result)
104
- end
105
-
106
- def test_get_proper_nouns
107
- test = "<nnp>BBC</nnp> <vbz>means</vbz> <nnp>British Broadcasting Corporation</nnp> <pp>.</pp>"
108
- result = @tagger.get_proper_nouns(test)
109
- assert_instance_of(Hash, result)
110
- end
111
-
112
- def test_get_readable
113
- test = "I woke up to the sound of pouring rain."
114
- result = @tagger.get_readable(test)
115
- assert(String, result)
116
-
117
- test = "I woke up to the sound of pouring rain."
118
- result = @tagger.get_readable(test)
119
- expected_result = "I/PRP woke/VBD up/RB to/TO the/DET sound/NN of/IN pouring/VBG rain/NN ./PP"
120
- assert_equal(expected_result, result)
121
- test = "I woke up with a <bad> word."
122
- result = @tagger.get_readable(test)
123
- expected_result = "I/PRP woke/VBD up/RB with/IN a/DET <bad>/NNP word/NN ./PP"
124
- assert_equal(expected_result, result)
125
- end
126
-
127
-
128
- def test_get_sentences
129
- result = @tagger.get_sentences(@@untagged)
130
- assert_equal(4, result.length)
131
- end
132
-
133
- def test_get_words
134
- @tagger.conf[:longest_noun_phrase] = 1
135
- result1 = @tagger.get_words(@@tagged)
136
- @tagger.conf[:longest_noun_phrase] = 10
137
- result2 = @tagger.get_words(@@tagged)
138
- assert_instance_of(Hash, result1)
139
- assert_instance_of(Hash, result2)
140
- end
141
-
142
- # Testing private instance methods
143
-
144
- def test_reset
145
- @tagger.conf[:current_tag] = 'nn'
146
- @tagger.send(:reset)
147
- assert_equal('pp', @tagger.conf[:current_tag])
148
- end
149
-
150
-
151
- def test_classify_unknown_word
152
- assert_equal("*LRB*", @tagger.send(:classify_unknown_word, "{"))
153
- assert_equal("*NUM*", @tagger.send(:classify_unknown_word, "123.4567"))
154
- assert_equal("*ORD*", @tagger.send(:classify_unknown_word, "40th"))
155
- assert_equal("-abr-", @tagger.send(:classify_unknown_word, "GT-R"))
156
- assert_equal("-hyp-adj-", @tagger.send(:classify_unknown_word, "extremely-high"))
157
- assert_equal("-sym-", @tagger.send(:classify_unknown_word, "&&"))
158
- assert_equal("-ing-", @tagger.send(:classify_unknown_word, "wikiing"))
159
- assert_equal("-unknown-", @tagger.send(:classify_unknown_word, "asefasdf"))
160
- end
161
-
162
-
163
- def test_clean_word
164
- models = []; tests = []
165
- models += ["*NUM*"]
166
- models += ["Plays"]
167
- models += ["pleadingly"]
168
- tests += ["1973.0820", "Plays", "Pleadingly"]
169
- models.length.times do |i|
170
- assert_equal(models[i], @tagger.send(:clean_word, tests[i]))
171
- end
172
- end
173
-
174
- def test_get_max_noun_phrases
175
- result = @tagger.send(:get_max_noun_phrases, @@tagged)
176
- assert_instance_of(Hash, result)
177
- end
178
-
179
- def test_get_max_noun_regex
180
- assert_instance_of(Regexp, @tagger.send(:get_max_noun_regex))
181
- end
182
-
183
- def test_split_punct
184
- models = []; texts = []
185
- models << ["`", "test"]; texts << "`test"
186
- models << ["``", "test"]; texts << "\"test"
187
- models << ["`", "test"]; texts << "'test"
188
- models << ["''"]; texts << '"'
189
- models << ["test", "'"]; texts << "test' "
190
- models << ["-", "test", "-"]; texts << "---test-----"
191
- models << ["test", ",", "test"]; texts << "test,test"
192
- models << ["123,456"]; texts << "123,456"
193
- models << ["test", ":", "test"]; texts << "test:test"
194
- models << ["123", ":", "456"]; texts << "123:456"
195
- models << ["test1", "...", "test2"]; texts << "test1...test2"
196
- models << ["{", "ab","[","(","c",")","[","d","]","]","}"]; texts << "{ab[(c)[d]]}"
197
- models << ["test", "#", "test"]; texts << "test#test"
198
- models << ["I", "'d", "like"]; texts << "I'd like"
199
- models << ["is", "n't", "so"]; texts << "isn't so"
200
- models << ["we", "'re", "all"]; texts << "we're all"
201
-
202
- texts.each_with_index do |text, index|
203
- assert_equal(models[index], @tagger.send(:split_punct, text))
204
- end
205
- end
206
-
207
- def test_split_sentences
208
- models = []; tests = []
209
- models << ["He", "is", "a", "u.s.", "army", "officer", "."]
210
- tests << ["He", "is", "a", "u.s.", "army", "officer."]
211
- models << ["He", "is", "Mr.", "Johnson", ".", "He", "'s", "my", "friend", "."]
212
- tests << ["He", "is", "Mr.", "Johnson.", "He", "'s", "my", "friend."]
213
- models.length.times do |i|
214
- assert_equal(models[i], @tagger.send(:split_sentences, tests[i]))
215
- end
216
- end
217
-
218
- def test_stem
219
- word = "gets"
220
- old = @tagger.conf[:stem]
221
- @tagger.conf[:stem] = true
222
- assert_equal("get", @tagger.stem(word))
223
- # the following should not work since we memoize stem method
224
- # @tagger.conf[:stem] = false
225
- # assert_equal("gets", @tagger.stem(word))
226
- @tagger.conf[:stem] = old
227
- end
228
-
229
- def test_strip_tags
230
- assert_instance_of(String, @tagger.send(:strip_tags, @@tagged))
231
- end
232
-
233
- def test_valid_text
234
- text = nil
235
- assert(!@tagger.send(:valid_text, text))
236
- text = "this is test text"
237
- assert(@tagger.send(:valid_text, text))
238
- text = ""
239
- assert(!@tagger.send(:valid_text, text))
240
- end
241
-
242
- def test_override_default_params
243
- @tagger = EngTagger.new(:longest_noun_phrase => 3)
244
- assert_equal 3, @tagger.conf[:longest_noun_phrase]
245
- end
246
- end