rbtagger 0.2.5 → 0.2.6
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/brill/tagger.rb +40 -2
- data/lib/rbtagger/version.rb +1 -1
- data/test/test_rule_tagger.rb +87 -10
- data/website/index.html +5 -2
- metadata +2 -2
data/lib/brill/tagger.rb
CHANGED
@@ -12,14 +12,50 @@ module Brill
|
|
12
12
|
# returns similar results as tag, but further reduced by only selecting nouns
|
13
13
|
def suggest( text, max = 10 )
|
14
14
|
tags = tag(text)
|
15
|
+
#puts tags.inspect
|
16
|
+
ptag = [nil,nil]
|
17
|
+
# join NNP's together for names
|
18
|
+
reduced_tags = []
|
19
|
+
mappings = {} # keep a mapping of the joined words to expand
|
20
|
+
tags.each{|tag|
|
21
|
+
if ptag.last == 'NNP' and tag.last == 'NNP' and !ptag.first.match(/\.$/)
|
22
|
+
ptag[0] += " " + tag.first
|
23
|
+
# before combining these two create a mapping for each word to each word
|
24
|
+
words = ptag.first.split(/\s/)
|
25
|
+
i = 0
|
26
|
+
#puts words.inspect
|
27
|
+
until (i + 1) == words.size
|
28
|
+
mappings[words[i]] = ptag.first
|
29
|
+
mappings[words[i+1]] = ptag.first
|
30
|
+
i += 1
|
31
|
+
end
|
32
|
+
#puts mappings.inspect
|
33
|
+
elsif tag.last == 'NNP'
|
34
|
+
ptag = tag
|
35
|
+
elsif tag.last != 'NNP' and ptag.first != nil
|
36
|
+
reduced_tags << ptag
|
37
|
+
reduced_tags << tag if tag.last.match( /^\w+$/ ) and tag.first.match(/^\w+$/)
|
38
|
+
ptag = [nil,nil]
|
39
|
+
elsif tag.last.match( /^\w+$/ ) and tag.first.match(/^\w+$/)
|
40
|
+
reduced_tags << tag
|
41
|
+
end
|
42
|
+
}
|
43
|
+
# now expand any NNP that appear
|
44
|
+
tags = reduced_tags.map{|tag|
|
45
|
+
if tag.last == 'NNP'
|
46
|
+
#puts "#{tag.first} => #{mappings[tag.first]}"
|
47
|
+
tag[0] = mappings[tag.first] if mappings.key?(tag.first)
|
48
|
+
end
|
49
|
+
tag
|
50
|
+
}
|
15
51
|
results = tags.select{|tag| tag.last.match(/NN/) }
|
16
52
|
if results.size > max
|
17
53
|
counts = {}
|
18
54
|
tags = []
|
19
55
|
results.each {|tag| counts[tag.first] = 0 }
|
20
56
|
results.each do |tag|
|
21
|
-
counts[tag.first]
|
22
|
-
|
57
|
+
tags << tag if counts[tag.first] == 0
|
58
|
+
counts[tag.first] += tag.last == 'NNP' ? 3 : (tag.last == 'NNS' ? 2 : 1)
|
23
59
|
end
|
24
60
|
tags.map!{|tag| [tag.first, tag.last,counts[tag.first]]}
|
25
61
|
t = 1
|
@@ -41,6 +77,8 @@ module Brill
|
|
41
77
|
# returns an array like [[token,tag],[token,tag]...[token,tag]]
|
42
78
|
#
|
43
79
|
def tag( text )
|
80
|
+
text = text.gsub(/dont/,"don't").gsub(/Dont/,"Don't")
|
81
|
+
text = text.gsub(/youre/,"you're")
|
44
82
|
tokens = Brill::Tagger.tokenize( text )
|
45
83
|
tags = Brill::Tagger.tag_start( tokens )
|
46
84
|
|
data/lib/rbtagger/version.rb
CHANGED
data/test/test_rule_tagger.rb
CHANGED
@@ -23,24 +23,96 @@ Share your feelings
|
|
23
23
|
Allow yourself time to discuss the emotional consequences of your illness and treatment with family, friends, your doctor and, if necessary, a professional therapist. Many patients also find antidepressants helpful during treatment.
|
24
24
|
Stay connected
|
25
25
|
Although many newly diagnosed patients fear they will not be able to keep working during treatment, this is usually not the case. Working, even at a reduced schedule, helps you maintain valuable social connections and weekly structure.
|
26
|
+
)
|
27
|
+
SAMPLE_DOC2=%q(
|
28
|
+
Britney Spears was granted a change in her visitation schedule with her sons Sean Preston and Jayden James at a hearing Tuesday.
|
29
|
+
"There was a change in visitation status that was ordered by Commissioner Gordon this morning," Los Angeles Superior Court spokesperson Alan Parachini confirmed after the hearing, which both Kevin Federline and her father (and co-conservator) Jamie Spears attended. (Britney and Kevin did not address each other during the hearing.)
|
30
|
+
The details of her visitation, however, are unclear.
|
31
|
+
"I'm not at liberty to answer any questions about the nature of that change," Parachini said. (TMZ.com had reported that Spears wanted overnight visits.)
|
32
|
+
Asked by Us if she were happy with the court outcome, Spears (clutching an Ed Hardy purse) smiled and told Us, "Yes."
|
33
|
+
Next up: A status hearing set for July 15.
|
34
|
+
The couple last appeared in court May 6. Spears was granted extended visitation — three days a week from 9 a.m. to 5 p.m. — of Sean Preston, 2, and Jayden James, 20 months.
|
26
35
|
)
|
27
36
|
def setup
|
28
37
|
if !defined?($tagger)
|
29
|
-
puts "loading tagger..."
|
30
38
|
$rtagger = Brill::Tagger.new( File.join(File.dirname(__FILE__),"LEXICON"),
|
31
|
-
|
32
|
-
|
33
|
-
puts "tagger loaded!"
|
39
|
+
File.join(File.dirname(__FILE__),"LEXICALRULEFILE"),
|
40
|
+
File.join(File.dirname(__FILE__),"CONTEXTUALRULEFILE") )
|
34
41
|
end
|
35
42
|
end
|
36
43
|
|
37
44
|
def test_simple_tagger
|
38
45
|
pairs = tagger.tag( SAMPLE_DOC )
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
46
|
+
tags = [["", ")"], ["", ")"], ["Take", "VB"], ["an", "DT"], ["active", "JJ"], ["role", "NN"], ["in", "IN"],
|
47
|
+
["your", "PRP$"], ["care", "NN"], ["When", "WRB"], ["it", "PRP"], ["comes", "VBZ"], ["to", "TO"],
|
48
|
+
["making", "VBG"], ["decisions", "NNS"], ["about", "IN"], ["the", "DT"], ["goals", "NNS"], ["and", "CC"],
|
49
|
+
["direction", "NN"], ["of", "IN"], ["treatment", "NN"], [",", ","], ["", ")"], ["do", "VBP"], ["", ")"],
|
50
|
+
["n't", "RB"], ["sit", "VB"], ["back.", "CD"], ["Work", "NN"], ["closely", "RB"], ["and", "CC"],
|
51
|
+
["actively", "RB"], ["with", "IN"], ["your", "PRP$"], ["oncologist", "NN"], ["and", "CC"], ["the", "DT"],
|
52
|
+
["rest", "NN"], ["of", "IN"], ["your", "PRP$"], ["medical", "JJ"], ["team.", "JJ"], ["Do", "VBP"],
|
53
|
+
["", ")"], ["n't", "RB"], ["overlook", "VB"], ["clinical", "JJ"], ["trials", "NNS"], ["If", "IN"],
|
54
|
+
["you", "PRP"], ["'re", "VBP"], ["eligible", "JJ"], ["to", "TO"], ["enroll", "VB"], ["in", "IN"],
|
55
|
+
["clinical", "JJ"], ["trials", "NNS"], [",", ","], ["", ")"], ["select", "VB"], ["an", "DT"],
|
56
|
+
["oncologist", "NN"], ["who", "WP"], ["participates", "VBZ"], ["in", "IN"], ["them.", "JJ"],
|
57
|
+
["Patients", "NNS"], ["who", "WP"], ["enroll", "VBP"], ["in", "IN"], ["clinical", "JJ"],
|
58
|
+
["studies", "NNS"], ["receive", "VBP"], ["closer", "JJR"], ["follow-up", "NN"], [",", ","], ["", ")"],
|
59
|
+
["the", "DT"], ["highest", "JJS"], ["standard-of-care", "JJ"], ["treatment", "NN"], ["and", "CC"],
|
60
|
+
["access", "NN"], ["to", "TO"], ["experimental", "JJ"], ["therapies", "NNS"], ["at", "IN"], ["no", "DT"],
|
61
|
+
["extra", "JJ"], ["cost.", "NNP"], ["Maximize", "NNP"], ["your", "PRP$"], ["nutrition", "NN"],
|
62
|
+
["strategy", "NN"], ["Doing", "NNP"], ["your", "PRP$"], ["best", "JJS"], ["to", "TO"], ["eat", "VB"],
|
63
|
+
["a", "DT"], ["healthy", "JJ"], [",", ","], ["", ")"], ["well-balanced", "JJ"], ["diet", "NN"],
|
64
|
+
["is", "VBZ"], ["vital", "JJ"], ["to", "TO"], ["prompt", "VB"], ["healing", "NN"], ["after", "IN"],
|
65
|
+
["surgery", "NN"], ["and", "CC"], ["for", "IN"], ["recovery", "NN"], ["from", "IN"], ["radiation", "NN"],
|
66
|
+
["or", "CC"], ["chemotherapy.", "JJ"], ["Many", "JJ"], ["oncology", "NN"], ["practices", "NNS"],
|
67
|
+
["employ", "VBP"], ["registered", "VBN"], ["dieticians", "NNS"], ["who", "WP"], ["can", "MD"],
|
68
|
+
["help", "VB"], ["you", "PRP"], ["optimize", "VB"], ["your", "PRP$"], ["nutrition.", "JJ"],
|
69
|
+
["Steer", "VB"], ["clear", "JJ"], ["of", "IN"], ["", ")"], ["``", "``"], ["natural", "JJ"],
|
70
|
+
["cures", "NNS"], ["''", "''"], ["", ")"], ["Before", "IN"], ["trying", "VBG"], ["nutritional", "JJ"],
|
71
|
+
["supplements", "NNS"], ["or", "CC"], ["herbal", "JJ"], ["remedies", "NNS"], [",", ","], ["", ")"],
|
72
|
+
["be", "VB"], ["sure", "JJ"], ["to", "TO"], ["discuss", "VB"], ["your", "PRP$"], ["plans", "NNS"],
|
73
|
+
["with", "IN"], ["a", "DT"], ["doctor.", "JJ"], ["Most", "JJS"], ["have", "VBP"], ["not", "RB"],
|
74
|
+
["been", "VBN"], ["tested", "VBN"], ["in", "IN"], ["clinical", "JJ"], ["studies", "NNS"], [",", ","],
|
75
|
+
["", ")"], ["and", "CC"], ["some", "DT"], ["may", "MD"], ["actually", "RB"], ["interfere", "VB"],
|
76
|
+
["with", "IN"], ["your", "PRP$"], ["treatment.", "JJ"], ["Build", "VB"], ["a", "DT"], ["stronger", "JJR"],
|
77
|
+
["body", "NN"], ["Even", "RB"], ["walking", "VBG"], ["regularly", "RB"], ["is", "VBZ"], ["can", "MD"],
|
78
|
+
["help", "VB"], ["you", "PRP"], ["minimize", "VB"], ["long-term", "JJ"], ["muscle", "NN"],
|
79
|
+
["weakness", "NN"], ["caused", "VBN"], ["by", "IN"], ["illness", "NN"], ["or", "CC"],
|
80
|
+
["de-conditioning.", "NNP"], ["Focus", "NNP"], ["on", "IN"], ["overall", "JJ"], ["health", "NN"],
|
81
|
+
["Patients", "NNS"], ["may", "MD"], ["be", "VB"], ["cured", "VBN"], ["of", "IN"], ["cancer", "NN"],
|
82
|
+
["but", "CC"], ["still", "JJ"], ["face", "NN"], ["life-threatening", "JJ"], ["medical", "JJ"],
|
83
|
+
["problems", "NNS"], ["that", "WDT"], ["are", "VBP"], ["underemphasized", "JJ"], ["during", "IN"],
|
84
|
+
["cancer", "NN"], ["treatments", "NNS"], [",", ","], ["", ")"], ["such", "JJ"], ["as", "IN"],
|
85
|
+
["diabetes", "NN"], [",", ","], ["", ")"], ["high", "JJ"], ["blood", "NN"], ["pressure", "NN"],
|
86
|
+
["and", "CC"], ["heart", "NN"], ["disease.", "JJ"], ["Continue", "VB"], ["to", "TO"], ["monitor", "VB"],
|
87
|
+
["your", "PRP$"], ["overall", "JJ"], ["health.", "JJ"], ["Put", "NN"], ["the", "DT"], ["fire", "NN"],
|
88
|
+
["out", "IN"], ["for", "IN"], ["good", "JJ"], ["Smoking", "NNP"], ["impairs", "NNS"], ["healing", "NN"],
|
89
|
+
["after", "IN"], ["surgery", "NN"], ["and", "CC"], ["radiation", "NN"], ["and", "CC"], ["increases", "NNS"],
|
90
|
+
["your", "PRP$"], ["risk", "NN"], ["of", "IN"], ["cardiovascular", "JJ"], ["disease", "NN"], ["and", "CC"],
|
91
|
+
["many", "JJ"], ["types", "NNS"], ["of", "IN"], ["cancers.", "CD"], ["Ask", "VB"], ["your", "PRP$"],
|
92
|
+
["doctor", "NN"], ["for", "IN"], ["help", "NN"], ["identifying", "VBG"], ["and", "CC"], ["obtaining", "VBG"],
|
93
|
+
["the", "DT"], ["most", "RBS"], ["appropriate", "JJ"], ["cessation", "NN"], ["aids.", "NNP"], ["Map", "NNP"],
|
94
|
+
["a", "DT"], ["healthy", "JJ"], ["future", "NN"], ["Once", "RB"], ["youve", "VBP"], ["completed", "VBN"],
|
95
|
+
["treatment", "NN"], [",", ","], ["", ")"], ["discuss", "VB"], ["appropriate", "JJ"], ["follow-up", "NN"],
|
96
|
+
["plans", "NNS"], ["with", "IN"], ["your", "PRP$"], ["doctor", "NN"], ["and", "CC"], ["keep", "VB"],
|
97
|
+
["track", "NN"], ["of", "IN"], ["them", "PRP"], ["yourself.", "CD"], ["Intensified", "JJ"], ["screening", "NN"],
|
98
|
+
["over", "IN"], ["many", "JJ"], ["years", "NNS"], ["is", "VBZ"], ["frequently", "RB"], ["recommended", "VBN"],
|
99
|
+
["to", "TO"], ["identify", "VB"], ["and", "CC"], ["treat", "VB"], ["a", "DT"], ["recurrence", "NN"], ["early", "JJ"],
|
100
|
+
["on.", "CD"], ["Share", "VB"], ["your", "PRP$"], ["feelings", "NNS"], ["Allow", "VB"], ["yourself", "PRP"],
|
101
|
+
["time", "NN"], ["to", "TO"], ["discuss", "VB"], ["the", "DT"], ["emotional", "JJ"], ["consequences", "NNS"],
|
102
|
+
["of", "IN"], ["your", "PRP$"], ["illness", "NN"], ["and", "CC"], ["treatment", "NN"], ["with", "IN"],
|
103
|
+
["family", "NN"], [",", ","], ["", ")"], ["friends", "NNS"], [",", ","], ["", ")"], ["your", "PRP$"],
|
104
|
+
["doctor", "NN"], ["and", "CC"], [",", ","], ["", ")"], ["if", "IN"], ["necessary", "JJ"], [",", ","],
|
105
|
+
["", ")"], ["a", "DT"], ["professional", "JJ"], ["therapist.", "JJ"], ["Many", "JJ"], ["patients", "NNS"],
|
106
|
+
["also", "RB"], ["find", "VBP"], ["antidepressants", "NNS"], ["helpful", "JJ"], ["during", "IN"],
|
107
|
+
["treatment.", "JJ"], ["Stay", "VB"], ["connected", "VBN"], ["Although", "IN"], ["many", "JJ"],
|
108
|
+
["newly", "RB"], ["diagnosed", "VBN"], ["patients", "NNS"], ["fear", "VBP"], ["they", "PRP"], ["will", "MD"],
|
109
|
+
["not", "RB"], ["be", "VB"], ["able", "JJ"], ["to", "TO"], ["keep", "VB"], ["working", "VBG"], ["during", "IN"],
|
110
|
+
["treatment", "NN"], [",", ","], ["", ")"], ["this", "DT"], ["is", "VBZ"], ["usually", "RB"], ["not", "RB"],
|
111
|
+
["the", "DT"], ["case.", "CD"], ["Working", "NNP"], [",", ","], ["", ")"], ["even", "RB"], ["at", "IN"],
|
112
|
+
["a", "DT"], ["reduced", "VBN"], ["schedule", "NN"], [",", ","], ["", ")"], ["helps", "VBZ"], ["you", "PRP"],
|
113
|
+
["maintain", "VBP"], ["valuable", "JJ"], ["social", "JJ"], ["connections", "NNS"], ["and", "CC"],
|
114
|
+
["weekly", "JJ"], ["structure", "NN"], [".", "."]]
|
115
|
+
assert_equal tags, pairs
|
44
116
|
end
|
45
117
|
|
46
118
|
def test_multiple_docs
|
@@ -56,7 +128,12 @@ Although many newly diagnosed patients fear they will not be able to keep workin
|
|
56
128
|
|
57
129
|
def test_suggest
|
58
130
|
results = tagger.suggest( SAMPLE_DOC )
|
59
|
-
|
131
|
+
# puts results.inspect
|
132
|
+
assert results.include?(["treatment", "NN", 5])
|
133
|
+
results = tagger.suggest( SAMPLE_DOC2 )
|
134
|
+
assert results.include?(["Britney Spears", "NNP", 6])
|
135
|
+
assert results.include?(["Jamie Spears", "NNP", 12])
|
136
|
+
# puts results.inspect
|
60
137
|
end
|
61
138
|
|
62
139
|
private
|
data/website/index.html
CHANGED
@@ -160,7 +160,7 @@
|
|
160
160
|
<h1>rbtagger</h1>
|
161
161
|
<div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/rbtagger"; return false'>
|
162
162
|
<p>Get Version</p>
|
163
|
-
<a href="http://rubyforge.org/projects/rbtagger" class="numbers">0.2.
|
163
|
+
<a href="http://rubyforge.org/projects/rbtagger" class="numbers">0.2.5</a>
|
164
164
|
</div>
|
165
165
|
<h4 style="float:right;padding-right:10px;"> → ‘rbtagger’</h4>
|
166
166
|
|
@@ -194,6 +194,9 @@ gem install rbtagger
|
|
194
194
|
<span class="ident">docs</span><span class="punct">.</span><span class="ident">each</span> <span class="keyword">do</span><span class="punct">|</span><span class="ident">doc</span><span class="punct">|</span>
|
195
195
|
<span class="ident">tagger</span><span class="punct">.</span><span class="ident">tag</span><span class="punct">(</span> <span class="constant">File</span><span class="punct">.</span><span class="ident">read</span><span class="punct">(</span> <span class="ident">doc</span> <span class="punct">)</span> <span class="punct">)</span>
|
196
196
|
<span class="keyword">end</span>
|
197
|
+
|
198
|
+
<span class="ident">tagger</span><span class="punct">.</span><span class="ident">suggest</span><span class="punct">(</span> <span class="constant">File</span><span class="punct">.</span><span class="ident">read</span><span class="punct">("</span><span class="string">sample.txt</span><span class="punct">")</span> <span class="punct">)</span>
|
199
|
+
<span class="punct">=></span> <span class="punct">[["</span><span class="string">doctor</span><span class="punct">",</span> <span class="punct">"</span><span class="string">NN</span><span class="punct">",</span> <span class="number">3</span><span class="punct">],</span> <span class="punct">["</span><span class="string">treatment</span><span class="punct">",</span> <span class="punct">"</span><span class="string">NN</span><span class="punct">",</span> <span class="number">5</span><span class="punct">]]</span>
|
197
200
|
</pre>
|
198
201
|
|
199
202
|
<h4>Using the word tagger</h4>
|
@@ -242,7 +245,7 @@ rake install_gem</pre>
|
|
242
245
|
|
243
246
|
<p>Comments are welcome. Send an email to <a href="mailto:rb-brill-tagger@googlegroups.com">Todd A. Fisher</a> email via the <a href="http://groups.google.com/group/rb-brill-tagger">forum</a></p>
|
244
247
|
<p class="coda">
|
245
|
-
<a href="http://xullicious.blogspot.com/">Todd A. Fisher</a>,
|
248
|
+
<a href="http://xullicious.blogspot.com/">Todd A. Fisher</a>, 23rd June 2008<br>
|
246
249
|
Theme extended from <a href="http://rb2js.rubyforge.org/">Paul Battley</a>
|
247
250
|
</p>
|
248
251
|
</div>
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbtagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.5
|
4
|
+
version: 0.2.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Todd A. Fisher
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-06-
|
12
|
+
date: 2008-06-25 00:00:00 -04:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|