engtagger 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,196 @@
1
+ # Code Generated by ZenTest v. 3.9.2
2
+ # classname: asrt / meth = ratio%
3
+ # EngTagger: 0 / 24 = 0.00%
4
+
5
# Path of the gem's lib directory, one level above this test file. It is
# kept in a global because the test class below reads it again.
$ENGTAGGER_LIB = File.join(File.dirname(__FILE__), '..', 'lib')
$LOAD_PATH.push($ENGTAGGER_LIB)

# Load test/unit ourselves unless $ZENTEST is both defined and truthy.
require 'test/unit' unless defined?($ZENTEST) && $ZENTEST
require 'engtagger'
9
+
10
# Test::Unit suite for EngTagger. Each test builds on the tagger instance
# created in #setup; most checks only verify the type of the returned
# object, a few compare against exact expected values.
class TestEngTagger < Test::Unit::TestCase

  # Plain English passage used as input for the tagging/sentence tests.
  @@untagged =<<EOD
Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome."
EOD

  # Tagged form of the first sentence of the passage above, used as input
  # for the extraction tests (get_nouns, get_noun_phrases, ...).
  @@tagged =<<EOD
<nnp>Lisa</nnp> <nnp>Raines</nnp> <ppc>,</ppc> <det>a</det> <nn>lawyer</nn> <cc>and</cc> <nn>director</nn> <in>of</in> <nn>government</nn> <nns>relations</nns> <in>for</in> <det>the</det> <nnp>Industrial</nnp> <nnp>Biotechnical</nnp> <nnp>Association</nnp> <ppc>,</ppc> <vbz>contends</vbz> <in>that</in> <det>a</det> <nn>judge</nn> <jj>well-versed</jj> <in>in</in> <nn>patent</nn> <nn>law</nn> <cc>and</cc> <det>the</det> <nns>concerns</nns> <in>of</in> <jj>research-based</jj> <nns>industries</nns> <md>would</md> <vb>have</vb> <vbn>ruled</vbn> <rb>otherwise</rb> <pp>.</pp>
EOD

  # Creates a fresh tagger before every test. If either file named by
  # conf[:tag_path] / conf[:word_path] is missing under $ENGTAGGER_LIB,
  # @tagger.install is called first.
  def setup
    @tagger = EngTagger.new
    tagpath = File.join($ENGTAGGER_LIB, @tagger.conf[:tag_path])
    wordpath = File.join($ENGTAGGER_LIB, @tagger.conf[:word_path])
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    @tagger.install unless File.exist?(tagpath) && File.exist?(wordpath)
  end

  # NOTE(review): the name starts with "text_", not "test_", so Test::Unit
  # never runs this method. The model string also contains what looks like
  # a stray "}" ("</cd}>"). Both are left untouched because the signature
  # and return value of EngTagger.get_ext are not visible from this file;
  # confirm them before renaming the method to enable it.
  def text_get_ext
    model = '<cd>[^<]+</cd}>\s*'
    assert_equal(model, EngTagger.get_ext(model, "cd"))
  end

  # explain_tag turns a short tag code into its descriptive name.
  def test_explain_tag
    assert_equal("noun", EngTagger.explain_tag("nn"))
    assert_equal("verb_infinitive", EngTagger.explain_tag("vb"))
  end

  # add_tags must hand back a String for the sample passage.
  def test_add_tags
    assert_instance_of(String, @tagger.add_tags(@@untagged))
  end

  # Checks assign_tag in two ways: exact results for the special word
  # classes, and membership in EngTagger.hmm's keys for ordinary words.
  # The argument pairs are presumably [previous tag, word] -- confirm
  # against EngTagger#assign_tag.
  def test_assign_tag
    expected = [@tagger.conf[:unknown_word_tag], "sym"]
    special_cases = [["pp", "-unknown-"], ["pp", "-sym-"]]
    expected.zip(special_cases).each do |model, args|
      assert_equal(model, @tagger.assign_tag(*args))
    end

    # Iterate the cases themselves instead of borrowing the length of the
    # unrelated expected-values array.
    [["vb", "water"], ["nn", "runs"]].each do |args|
      result = @tagger.assign_tag(*args)
      assert(EngTagger.hmm.keys.include?(result))
    end
  end

  # One sample word for each class label classify_unknown_word is
  # expected to produce here.
  def test_classify_unknown_word
    assert_equal("*LRB*", @tagger.classify_unknown_word("{"))
    assert_equal("*NUM*", @tagger.classify_unknown_word("123.4567"))
    assert_equal("*ORD*", @tagger.classify_unknown_word("40th"))
    assert_equal("-abr-", @tagger.classify_unknown_word("GT-R"))
    assert_equal("-hyp-adj-", @tagger.classify_unknown_word("extremely-high"))
    assert_equal("-sym-", @tagger.classify_unknown_word("&&"))
    assert_equal("-ing-", @tagger.classify_unknown_word("wikiing"))
    assert_equal("-unknown-", @tagger.classify_unknown_word("asefasdf"))
  end

  # clean_text is expected to drop the HTML markup and return the text as
  # an array of tokens, keeping "Dr." intact and splitting off "'m".
  def test_clean_text
    test = "<html><body>I am <b>100% sure</b> that Dr. Watson is too naive. I'm sorry.</body></html>"
    model = ["I","am","100","%","sure","that","Dr.","Watson","is","too","naive",".","I","'m","sorry","."]
    assert_equal(model, @tagger.clean_text(test))
  end

  # Each pair is [input word, expected result of clean_word].
  def test_clean_word
    cases = [
      ["1973.0820", "*NUM*"],
      ["Plays", "Plays"],
      ["Pleadingly", "pleadingly"]
    ]
    cases.each do |input, model|
      assert_equal(model, @tagger.clean_word(input))
    end
  end

  # Only the return type is checked.
  def test_get_max_noun_phrases
    result = @tagger.get_max_noun_phrases(@@tagged)
    assert_instance_of(Hash, result)
  end

  # Only the return type is checked.
  def test_get_max_noun_regex
    assert_instance_of(Regexp, @tagger.get_max_noun_regex)
  end

  # Only the return type is checked.
  def test_get_noun_phrases
    result = @tagger.get_noun_phrases(@@tagged)
    assert_instance_of(Hash, result)
  end

  # Only the return type is checked.
  def test_get_nouns
    result = @tagger.get_nouns(@@tagged)
    assert_instance_of(Hash, result)
  end

  # Only the return type is checked, using a short hand-tagged sentence.
  def test_get_proper_nouns
    test = "<nnp>BBC</nnp> <vbz>means</vbz> <nnp>British Broadcasting Corporation</nnp> <pp>.</pp>"
    result = @tagger.get_proper_nouns(test)
    assert_instance_of(Hash, result)
  end

  # get_readable must return a String. assert_instance_of is required
  # here: assert(String, result) would treat String as the (always truthy)
  # condition and never fail.
  def test_get_readable
    test = "I woke up to the sound of pouring rain."
    result = @tagger.get_readable(test)
    assert_instance_of(String, result)
  end

  # The sample passage is expected to be split into four sentences.
  def test_get_sentences
    result = @tagger.get_sentences(@@untagged)
    assert_equal(4, result.length)
  end

  # Runs get_words with conf[:longest_noun_phrase] set to 1 and then 10;
  # both runs must return a Hash.
  def test_get_words
    @tagger.conf[:longest_noun_phrase] = 1
    result1 = @tagger.get_words(@@tagged)
    @tagger.conf[:longest_noun_phrase] = 10
    result2 = @tagger.get_words(@@tagged)
    assert_instance_of(Hash, result1)
    assert_instance_of(Hash, result2)
  end

  # After reset, conf[:current_tag] must be back to 'pp'.
  def test_reset
    @tagger.conf[:current_tag] = 'nn'
    @tagger.reset
    assert_equal('pp', @tagger.conf[:current_tag])
  end

  # models[i] is the token array expected from split_punct(texts[i]).
  def test_split_punct
    models = []; texts = []
    models << ["`", "test"]; texts << "`test"
    models << ["``", "test"]; texts << "\"test"
    models << ["`", "test"]; texts << "'test"
    models << ["''"]; texts << '"'
    models << ["test", "'"]; texts << "test' "
    models << ["-", "test", "-"]; texts << "---test-----"
    models << ["test", ",", "test"]; texts << "test,test"
    models << ["123,456"]; texts << "123,456"
    models << ["test", ":"]; texts << "test:"
    models << ["test1", "...", "test2"]; texts << "test1...test2"
    models << ["{", "ab","[","(","c",")","[","d","]","]","}"]; texts << "{ab[(c)[d]]}"
    models << ["test", "#", "test"]; texts << "test#test"
    models << ["I", "'d", "like"]; texts << "I'd like"
    models << ["is", "n't", "so"]; texts << "isn't so"
    models << ["we", "'re", "all"]; texts << "we're all"

    texts.each_with_index do |text, index|
      assert_equal(models[index], @tagger.split_punct(text))
    end
  end

  # Sentence-final periods must be split into their own token, while the
  # periods of "u.s." and "Mr." stay attached.
  def test_split_sentences
    models = []; tests = []
    models << ["He", "is", "a", "u.s.", "army", "officer", "."]
    tests << ["He", "is", "a", "u.s.", "army", "officer."]
    models << ["He", "is", "Mr.", "Johnson", ".", "He", "'s", "my", "friend", "."]
    tests << ["He", "is", "Mr.", "Johnson.", "He", "'s", "my", "friend."]
    models.zip(tests).each do |model, tokens|
      assert_equal(model, @tagger.split_sentences(tokens))
    end
  end

  # With conf[:stem] enabled, "gets" must stem to "get". The previous
  # setting is put back afterwards.
  def test_stem
    word = "gets"
    old = @tagger.conf[:stem]
    @tagger.conf[:stem] = true
    assert_equal("get", @tagger.stem(word))
    # the following should not work since we memoize stem method
    # @tagger.conf[:stem] = false
    # assert_equal("gets", @tagger.stem(word))
    @tagger.conf[:stem] = old
  end

  # Only the return type is checked.
  def test_strip_tags
    assert_instance_of(String, @tagger.strip_tags(@@tagged))
  end

  # nil and the empty string are invalid; ordinary text is valid.
  def test_valid_text
    text = nil
    assert(!@tagger.valid_text(text))
    text = "this is test text"
    assert(@tagger.valid_text(text))
    text = ""
    assert(!@tagger.valid_text(text))
  end
end
195
+
196
+ # Number of errors detected: 24
metadata ADDED
@@ -0,0 +1,86 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: engtagger
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Yoichiro Hasebe
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-05-08 00:00:00 +09:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hpricot
17
+ version_requirement:
18
+ version_requirements: !ruby/object:Gem::Requirement
19
+ requirements:
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: "0"
23
+ version:
24
+ - !ruby/object:Gem::Dependency
25
+ name: hoe
26
+ version_requirement:
27
+ version_requirements: !ruby/object:Gem::Requirement
28
+ requirements:
29
+ - - ">="
30
+ - !ruby/object:Gem::Version
31
+ version: 1.5.1
32
+ version:
33
+ description: A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained tagger that assigns POS tags to English text based on a lookup dictionary and a set of probability values. The tagger assigns appropriate tags based on conditional probabilities--it examines the preceding tag to determine the appropriate tag for the current word. Unknown words are classified according to word morphology or can be set to be treated as nouns or other parts of speech. The tagger also extracts as many nouns and noun phrases as it can, using a set of regular expressions.
34
+ email: yohasebe@gmail.com
35
+ executables: []
36
+
37
+ extensions: []
38
+
39
+ extra_rdoc_files:
40
+ - History.txt
41
+ - LICENSE.txt
42
+ - Manifest.txt
43
+ - README.txt
44
+ files:
45
+ - History.txt
46
+ - LICENSE.txt
47
+ - Manifest.txt
48
+ - README.txt
49
+ - Rakefile
50
+ - lib/engtagger.rb
51
+ - lib/engtagger/porter.rb
52
+ - lib/engtagger/pos_tags.hash
53
+ - lib/engtagger/pos_words.hash
54
+ - lib/engtagger/tags.yml
55
+ - lib/engtagger/unknown.yml
56
+ - lib/engtagger/words.yml
57
+ - test/test_engtagger.rb
58
+ has_rdoc: true
59
+ homepage: http://engtagger.rubyforge.org
60
+ post_install_message:
61
+ rdoc_options:
62
+ - --main
63
+ - README.txt
64
+ require_paths:
65
+ - lib
66
+ required_ruby_version: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: "0"
71
+ version:
72
+ required_rubygems_version: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: "0"
77
+ version:
78
+ requirements: []
79
+
80
+ rubyforge_project: engtagger
81
+ rubygems_version: 1.1.0
82
+ signing_key:
83
+ specification_version: 2
84
+ summary: English Part-of-Speech Tagger Library; a Ruby port of Lingua::EN::Tagger
85
+ test_files:
86
+ - test/test_engtagger.rb