engtagger 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/LICENSE.txt +340 -0
- data/Manifest.txt +13 -0
- data/README.txt +70 -0
- data/Rakefile +24 -0
- data/lib/engtagger.rb +729 -0
- data/lib/engtagger/porter.rb +196 -0
- data/lib/engtagger/pos_tags.hash +0 -0
- data/lib/engtagger/pos_words.hash +4467 -0
- data/lib/engtagger/tags.yml +45 -0
- data/lib/engtagger/unknown.yml +12 -0
- data/lib/engtagger/words.yml +43818 -0
- data/test/test_engtagger.rb +196 -0
- metadata +86 -0
@@ -0,0 +1,196 @@
|
|
1
|
+
# Code Generated by ZenTest v. 3.9.2
|
2
|
+
# classname: asrt / meth = ratio%
|
3
|
+
# EngTagger: 0 / 24 = 0.00%
|
4
|
+
|
5
|
+
$ENGTAGGER_LIB = File.join(File.dirname(__FILE__), '..', 'lib')
|
6
|
+
$LOAD_PATH << $ENGTAGGER_LIB
|
7
|
+
require 'test/unit' unless defined? $ZENTEST and $ZENTEST
|
8
|
+
require 'engtagger'
|
9
|
+
|
10
|
+
# Unit tests for EngTagger (skeleton originally generated by ZenTest).
#
# Several cases are smoke tests that only check the class of the returned
# value; the tokenizer cases (clean_text, clean_word, split_punct,
# split_sentences) pin exact output.
class TestEngTagger < Test::Unit::TestCase

  # Untagged sample passage fed to add_tags and get_sentences.
  @@untagged = <<EOD
Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome."
EOD

  # POS-tagged sample sentence fed to the noun / noun-phrase extraction tests,
  # get_words and strip_tags.
  @@tagged = <<EOD
<nnp>Lisa</nnp> <nnp>Raines</nnp> <ppc>,</ppc> <det>a</det> <nn>lawyer</nn> <cc>and</cc> <nn>director</nn> <in>of</in> <nn>government</nn> <nns>relations</nns> <in>for</in> <det>the</det> <nnp>Industrial</nnp> <nnp>Biotechnical</nnp> <nnp>Association</nnp> <ppc>,</ppc> <vbz>contends</vbz> <in>that</in> <det>a</det> <nn>judge</nn> <jj>well-versed</jj> <in>in</in> <nn>patent</nn> <nn>law</nn> <cc>and</cc> <det>the</det> <nns>concerns</nns> <in>of</in> <jj>research-based</jj> <nns>industries</nns> <md>would</md> <vb>have</vb> <vbn>ruled</vbn> <rb>otherwise</rb> <pp>.</pp>
EOD

  # Creates a fresh tagger for every test. When either file named by
  # conf[:tag_path] / conf[:word_path] is missing under $ENGTAGGER_LIB, the
  # tagger's install method is invoked first.
  def setup
    @tagger = EngTagger.new
    tagpath = File.join($ENGTAGGER_LIB, @tagger.conf[:tag_path])
    wordpath = File.join($ENGTAGGER_LIB, @tagger.conf[:word_path])
    # File.exist? replaces File.exists?, which was deprecated and later
    # removed from Ruby (3.2).
    @tagger.install unless File.exist?(tagpath) && File.exist?(wordpath)
  end

  # NOTE(review): this method is named text_get_ext rather than test_get_ext,
  # so Test::Unit never collects it. The expected pattern also contains what
  # looks like a stray "}" ("</cd}>"). Both are deliberately left untouched;
  # confirm the contract of EngTagger.get_ext before renaming the method to
  # enable it.
  def text_get_ext
    model = '<cd>[^<]+</cd}>\s*'
    assert_equal(model, EngTagger.get_ext(model, "cd"))
  end

  # explain_tag maps a short tag name to its descriptive label.
  def test_explain_tag
    assert_equal("noun", EngTagger.explain_tag("nn"))
    assert_equal("verb_infinitive", EngTagger.explain_tag("vb"))
  end

  # Smoke test: add_tags returns a String for the untagged sample.
  def test_add_tags
    assert_instance_of(String, @tagger.add_tags(@@untagged))
  end

  # assign_tag is called with two arguments per case (see the pairs below).
  # The first table pins the exact tag returned; the second only requires the
  # returned tag to be one of the keys of EngTagger.hmm.
  def test_assign_tag
    exact_cases = [
      [@tagger.conf[:unknown_word_tag], ["pp", "-unknown-"]],
      ["sym", ["pp", "-sym-"]]
    ]
    exact_cases.each do |expected, args|
      assert_equal(expected, @tagger.assign_tag(*args))
    end

    # Iterate the inputs themselves instead of borrowing the length of an
    # unrelated array.
    [["vb", "water"], ["nn", "runs"]].each do |args|
      result = @tagger.assign_tag(*args)
      assert(EngTagger.hmm.keys.include?(result))
    end
  end

  # classify_unknown_word maps a token to a placeholder class by its shape.
  def test_classify_unknown_word
    assert_equal("*LRB*", @tagger.classify_unknown_word("{"))
    assert_equal("*NUM*", @tagger.classify_unknown_word("123.4567"))
    assert_equal("*ORD*", @tagger.classify_unknown_word("40th"))
    assert_equal("-abr-", @tagger.classify_unknown_word("GT-R"))
    assert_equal("-hyp-adj-", @tagger.classify_unknown_word("extremely-high"))
    assert_equal("-sym-", @tagger.classify_unknown_word("&&"))
    assert_equal("-ing-", @tagger.classify_unknown_word("wikiing"))
    assert_equal("-unknown-", @tagger.classify_unknown_word("asefasdf"))
  end

  # clean_text is expected to drop the markup and return the token list.
  def test_clean_text
    test = "<html><body>I am <b>100% sure</b> that Dr. Watson is too naive. I'm sorry.</body></html>"
    model = ["I","am","100","%","sure","that","Dr.","Watson","is","too","naive",".","I","'m","sorry","."]
    assert_equal(model, @tagger.clean_text(test))
  end

  # Each pair is [expected, input] for clean_word.
  def test_clean_word
    cases = [
      ["*NUM*", "1973.0820"],
      ["Plays", "Plays"],
      ["pleadingly", "Pleadingly"]
    ]
    cases.each do |expected, word|
      assert_equal(expected, @tagger.clean_word(word))
    end
  end

  # Smoke test: get_max_noun_phrases returns a Hash.
  def test_get_max_noun_phrases
    result = @tagger.get_max_noun_phrases(@@tagged)
    assert_instance_of(Hash, result)
  end

  # Smoke test: get_max_noun_regex returns a Regexp.
  def test_get_max_noun_regex
    assert_instance_of(Regexp, @tagger.get_max_noun_regex)
  end

  # Smoke test: get_noun_phrases returns a Hash.
  def test_get_noun_phrases
    result = @tagger.get_noun_phrases(@@tagged)
    assert_instance_of(Hash, result)
  end

  # Smoke test: get_nouns returns a Hash.
  def test_get_nouns
    result = @tagger.get_nouns(@@tagged)
    assert_instance_of(Hash, result)
  end

  # Smoke test: get_proper_nouns returns a Hash.
  def test_get_proper_nouns
    test = "<nnp>BBC</nnp> <vbz>means</vbz> <nnp>British Broadcasting Corporation</nnp> <pp>.</pp>"
    result = @tagger.get_proper_nouns(test)
    assert_instance_of(Hash, result)
  end

  # Smoke test: get_readable returns a String.
  def test_get_readable
    test = "I woke up to the sound of pouring rain."
    result = @tagger.get_readable(test)
    # Was assert(String, result), which can never fail: String is truthy and
    # result was only being passed as the failure message.
    assert_instance_of(String, result)
  end

  # get_sentences is expected to yield four entries for the untagged sample.
  def test_get_sentences
    result = @tagger.get_sentences(@@untagged)
    assert_equal(4, result.length)
  end

  # Smoke test: get_words returns a Hash with conf[:longest_noun_phrase] set
  # to 1 and to 10.
  def test_get_words
    @tagger.conf[:longest_noun_phrase] = 1
    result1 = @tagger.get_words(@@tagged)
    @tagger.conf[:longest_noun_phrase] = 10
    result2 = @tagger.get_words(@@tagged)
    assert_instance_of(Hash, result1)
    assert_instance_of(Hash, result2)
  end

  # reset is expected to put conf[:current_tag] back to 'pp'.
  def test_reset
    @tagger.conf[:current_tag] = 'nn'
    @tagger.reset
    assert_equal('pp', @tagger.conf[:current_tag])
  end

  # Each pair is [expected tokens, input text] for split_punct.
  def test_split_punct
    cases = [
      [["`", "test"], "`test"],
      [["``", "test"], "\"test"],
      [["`", "test"], "'test"],
      [["''"], '"'],
      [["test", "'"], "test' "],
      [["-", "test", "-"], "---test-----"],
      [["test", ",", "test"], "test,test"],
      [["123,456"], "123,456"],
      [["test", ":"], "test:"],
      [["test1", "...", "test2"], "test1...test2"],
      [["{", "ab","[","(","c",")","[","d","]","]","}"], "{ab[(c)[d]]}"],
      [["test", "#", "test"], "test#test"],
      [["I", "'d", "like"], "I'd like"],
      [["is", "n't", "so"], "isn't so"],
      [["we", "'re", "all"], "we're all"]
    ]
    cases.each do |expected, text|
      assert_equal(expected, @tagger.split_punct(text))
    end
  end

  # Each pair is [expected tokens, input tokens] for split_sentences.
  def test_split_sentences
    cases = [
      [["He", "is", "a", "u.s.", "army", "officer", "."],
       ["He", "is", "a", "u.s.", "army", "officer."]],
      [["He", "is", "Mr.", "Johnson", ".", "He", "'s", "my", "friend", "."],
       ["He", "is", "Mr.", "Johnson.", "He", "'s", "my", "friend."]]
    ]
    cases.each do |expected, tokens|
      assert_equal(expected, @tagger.split_sentences(tokens))
    end
  end

  # Stems "gets" with conf[:stem] switched on, then restores the old setting.
  def test_stem
    word = "gets"
    old = @tagger.conf[:stem]
    @tagger.conf[:stem] = true
    assert_equal("get", @tagger.stem(word))
    # the following should not work since we memoize stem method
    # @tagger.conf[:stem] = false
    # assert_equal("gets", @tagger.stem(word))
    @tagger.conf[:stem] = old
  end

  # Smoke test: strip_tags returns a String.
  def test_strip_tags
    assert_instance_of(String, @tagger.strip_tags(@@tagged))
  end

  # valid_text must reject nil and the empty string and accept ordinary text.
  def test_valid_text
    assert(!@tagger.valid_text(nil))
    assert(@tagger.valid_text("this is test text"))
    assert(!@tagger.valid_text(""))
  end
end
|
195
|
+
|
196
|
+
# Number of errors detected: 24
|
metadata
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: engtagger
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Yoichiro Hasebe
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-05-08 00:00:00 +09:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hpricot
|
17
|
+
version_requirement:
|
18
|
+
version_requirements: !ruby/object:Gem::Requirement
|
19
|
+
requirements:
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: "0"
|
23
|
+
version:
|
24
|
+
- !ruby/object:Gem::Dependency
|
25
|
+
name: hoe
|
26
|
+
version_requirement:
|
27
|
+
version_requirements: !ruby/object:Gem::Requirement
|
28
|
+
requirements:
|
29
|
+
- - ">="
|
30
|
+
- !ruby/object:Gem::Version
|
31
|
+
version: 1.5.1
|
32
|
+
version:
|
33
|
+
description: A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained tagger that assigns POS tags to English text based on a lookup dictionary and a set of probability values. The tagger assigns appropriate tags based on conditional probabilities--it examines the preceding tag to determine the appropriate tag for the current word. Unknown words are classified according to word morphology or can be set to be treated as nouns or other parts of speech. The tagger also extracts as many nouns and noun phrases as it can, using a set of regular expressions.
|
34
|
+
email: yohasebe@gmail.com
|
35
|
+
executables: []
|
36
|
+
|
37
|
+
extensions: []
|
38
|
+
|
39
|
+
extra_rdoc_files:
|
40
|
+
- History.txt
|
41
|
+
- LICENSE.txt
|
42
|
+
- Manifest.txt
|
43
|
+
- README.txt
|
44
|
+
files:
|
45
|
+
- History.txt
|
46
|
+
- LICENSE.txt
|
47
|
+
- Manifest.txt
|
48
|
+
- README.txt
|
49
|
+
- Rakefile
|
50
|
+
- lib/engtagger.rb
|
51
|
+
- lib/engtagger/porter.rb
|
52
|
+
- lib/engtagger/pos_tags.hash
|
53
|
+
- lib/engtagger/pos_words.hash
|
54
|
+
- lib/engtagger/tags.yml
|
55
|
+
- lib/engtagger/unknown.yml
|
56
|
+
- lib/engtagger/words.yml
|
57
|
+
- test/test_engtagger.rb
|
58
|
+
has_rdoc: true
|
59
|
+
homepage: http://engtagger.rubyforge.org
|
60
|
+
post_install_message:
|
61
|
+
rdoc_options:
|
62
|
+
- --main
|
63
|
+
- README.txt
|
64
|
+
require_paths:
|
65
|
+
- lib
|
66
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: "0"
|
71
|
+
version:
|
72
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - ">="
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: "0"
|
77
|
+
version:
|
78
|
+
requirements: []
|
79
|
+
|
80
|
+
rubyforge_project: engtagger
|
81
|
+
rubygems_version: 1.1.0
|
82
|
+
signing_key:
|
83
|
+
specification_version: 2
|
84
|
+
summary: English Part-of-Speech Tagger Library; a Ruby port of Lingua::EN::Tagger
|
85
|
+
test_files:
|
86
|
+
- test/test_engtagger.rb
|