rbtagger 0.2.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
data/lib/brill/tagger.rb CHANGED
@@ -9,6 +9,34 @@ module Brill
9
9
  Brill::Tagger.load_contextual_rules(@tagger,contextual_rules)
10
10
  end
11
11
 
12
# Suggest keyword candidates for +text+.
#
# Runs #tag on +text+ and keeps only the [token, tag] pairs whose tag
# matches /NN/ (the noun tags).
#
# text:: the text handed to #tag
# max::  upper bound on the number of suggestions returned (default 10)
#
# Returns one of two shapes:
# * when the noun pairs already number +max+ or fewer, they are returned
#   untouched as [token, tag] pairs (duplicates included, no counts);
# * otherwise each distinct token is returned once as [token, tag, count],
#   where +tag+ is the tag of its first occurrence and +count+ is how many
#   times the token appeared among the nouns.
def suggest( text, max = 10 )
  nouns = tag(text).select { |pair| pair.last.match(/NN/) }
  return nouns if nouns.size <= max

  # Tally how often each noun token occurs.
  counts = Hash.new(0)
  nouns.each { |pair| counts[pair.first] += 1 }

  # Keep the first occurrence of every distinct token, annotated with its count.
  seen = {}
  ranked = []
  nouns.each do |pair|
    next if seen[pair.first]
    seen[pair.first] = true
    ranked << [pair.first, pair.last, counts[pair.first]]
  end

  # Drop the rarest tokens by raising the minimum count one step at a time
  # (thresholds 1 through 4); results leaving the loop this way are ordered
  # by ascending count.
  threshold = 1
  until ranked.size <= max
    ranked = ranked.sort_by { |entry| entry.last }.select { |entry| entry.last > threshold }
    threshold += 1
    if threshold == 5
      # Give up filtering: keep the most frequent tokens, highest count first.
      # The exclusive range caps the result at +max+ entries; the previous
      # inclusive range (0..max) leaked one extra entry.
      ranked = ranked.reverse[0...max]
      break
    end
  end
  ranked
end
39
+
12
40
  # Tag a body of text
13
41
  # returns an array like [[token,tag],[token,tag]...[token,tag]]
14
42
  #
@@ -2,7 +2,7 @@ module RbTagger #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 2
5
- TINY = 4
5
+ TINY = 5
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
data/test/test_helper.rb CHANGED
@@ -1,2 +1,5 @@
1
1
  require 'test/unit'
2
- require File.dirname(__FILE__) + '/../lib/rbtagger'
2
+ $:.unshift File.join(File.dirname(__FILE__),'..','ext','rule_tagger')
3
+ $:.unshift File.join(File.dirname(__FILE__),'..','ext','word_tagger')
4
+ $:.unshift File.join(File.dirname(__FILE__),'..','lib')
5
+ require 'rbtagger'
@@ -54,6 +54,11 @@ Although many newly diagnosed patients fear they will not be able to keep workin
54
54
  puts "time: #{duration} sec #{count.to_f/duration} docs/sec"
55
55
  end
56
56
 
57
# suggest on the sample document should yield exactly these
# [token, tag, count] entries.
def test_suggest
  expected = [["doctor", "NN", 3], ["treatment", "NN", 5]]
  assert_equal expected, tagger.suggest( SAMPLE_DOC )
end
61
+
57
62
  private
58
63
  def tagger
59
64
  $rtagger
data/website/index.html CHANGED
@@ -2,30 +2,157 @@
2
2
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3
3
  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4
4
  <head>
5
- <link rel="stylesheet" href="stylesheets/screen.css" type="text/css" media="screen" />
6
5
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7
- <title>
8
- rbtagger
9
- </title>
10
- <script src="javascripts/rounded_corners_lite.inc.js" type="text/javascript"></script>
11
- <style>
12
-
13
- </style>
14
- <script type="text/javascript">
15
- window.onload = function() {
16
- settings = {
17
- tl: { radius: 10 },
18
- tr: { radius: 10 },
19
- bl: { radius: 10 },
20
- br: { radius: 10 },
21
- antiAlias: true,
22
- autoPad: true,
23
- validTags: ["div"]
24
- }
25
- var versionBox = new curvyCorners(settings, document.getElementById("version"));
26
- versionBox.applyCornersToAll();
27
- }
28
- </script>
6
+ <title>rbtagger</title>
7
+ <style type="text/css">
8
+ body {
9
+ background-color: #F1F1F1;
10
+ font-family: "Georgia", sans-serif;
11
+ font-size: 16px;
12
+ line-height: 1.6em;
13
+ padding: 1.6em 0 0 0;
14
+ color: #333;
15
+ }
16
+ h1, h2, h3, h4, h5, h6 {
17
+ color: #444;
18
+ }
19
+ h1 {
20
+ font-family: sans-serif;
21
+ font-weight: normal;
22
+ font-size: 4em;
23
+ line-height: 0.8em;
24
+ letter-spacing: -0.1ex;
25
+ margin: 5px;
26
+ }
27
+ li {
28
+ padding: 0;
29
+ margin: 0;
30
+ list-style-type: square;
31
+ }
32
+ a {
33
+ color: #5E5AFF;
34
+ background-color: #DAC;
35
+ font-weight: normal;
36
+ text-decoration: underline;
37
+ }
38
+ blockquote {
39
+ font-size: 90%;
40
+ font-style: italic;
41
+ border-left: 1px solid #111;
42
+ padding-left: 1em;
43
+ }
44
+ .caps {
45
+ font-size: 80%;
46
+ }
47
+
48
+ #main {
49
+ width: 45em;
50
+ padding: 0;
51
+ margin: 0 auto;
52
+ }
53
+ .coda {
54
+ text-align: right;
55
+ color: #77f;
56
+ font-size: smaller;
57
+ }
58
+
59
+ table {
60
+ font-size: 90%;
61
+ line-height: 1.4em;
62
+ color: #ff8;
63
+ background-color: #111;
64
+ padding: 2px 10px 2px 10px;
65
+ border-style: dashed;
66
+ }
67
+
68
+ th {
69
+ color: #fff;
70
+ }
71
+
72
+ td {
73
+ padding: 2px 10px 2px 10px;
74
+ }
75
+
76
+ .success {
77
+ color: #0CC52B;
78
+ }
79
+
80
+ .failed {
81
+ color: #E90A1B;
82
+ }
83
+
84
+ .unknown {
85
+ color: #995000;
86
+ }
87
+ pre, code {
88
+ font-family: monospace;
89
+ font-size: 90%;
90
+ line-height: 1.4em;
91
+ color: #ff8;
92
+ background-color: #111;
93
+ padding: 2px 10px 2px 10px;
94
+ }
95
+ .comment { color: #aaa; font-style: italic; }
96
+ .keyword { color: #eff; font-weight: bold; }
97
+ .punct { color: #eee; font-weight: bold; }
98
+ .symbol { color: #0bb; }
99
+ .string { color: #6b4; }
100
+ .ident { color: #ff8; }
101
+ .constant { color: #66f; }
102
+ .regex { color: #ec6; }
103
+ .number { color: #F99; }
104
+ .expr { color: #227; }
105
+
106
+ #version {
107
+ float: right;
108
+ text-align: right;
109
+ font-family: sans-serif;
110
+ font-weight: normal;
111
+ background-color: #B3ABFF;
112
+ color: #141331;
113
+ padding: 15px 20px 10px 20px;
114
+ margin: 0 auto;
115
+ margin-top: 15px;
116
+ border: 3px solid #141331;
117
+ display:block;
118
+ -moz-border-radius-bottomleft:10px;
119
+ -moz-border-radius-bottomright:10px;
120
+ -moz-border-radius-topleft:10px;
121
+ -moz-border-radius-topright:10px;
122
+ -webkit-border-bottom-left-radius:10px;
123
+ -webkit-border-bottom-right-radius:10px;
124
+ -webkit-border-top-left-radius:10px;
125
+ -webkit-border-top-right-radius:10px;
126
+ }
127
+
128
+ #version .numbers {
129
+ display: block;
130
+ font-size: 4em;
131
+ line-height: 0.8em;
132
+ letter-spacing: -0.1ex;
133
+ margin-bottom: 15px;
134
+ }
135
+
136
+ #version p {
137
+ text-decoration: none;
138
+ color: #141331;
139
+ background-color: #B3ABFF;
140
+ margin: 0;
141
+ padding: 0;
142
+ }
143
+
144
+ #version a {
145
+ text-decoration: none;
146
+ color: #141331;
147
+ background-color: #B3ABFF;
148
+ }
149
+
150
+ .clickable {
151
+ cursor: pointer;
152
+ cursor: hand;
153
+ }
154
+
155
+ </style>
29
156
  </head>
30
157
  <body>
31
158
  <div id="main">
@@ -33,7 +160,7 @@
33
160
  <h1>rbtagger</h1>
34
161
  <div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/rbtagger"; return false'>
35
162
  <p>Get Version</p>
36
- <a href="http://rubyforge.org/projects/rbtagger" class="numbers">0.2.3</a>
163
+ <a href="http://rubyforge.org/projects/rbtagger" class="numbers">0.2.4</a>
37
164
  </div>
38
165
  <h4 style="float:right;padding-right:10px;"> &#x2192; &#8216;rbtagger&#8217;</h4>
39
166
 
@@ -50,33 +177,33 @@
50
177
 
51
178
 
52
179
  <p><pre class='syntax'>
53
- wget http://rubyforge.org/frs/download.php/37542/rbtagger-0.2.3.gem
54
- gem install rbtagger-0.2.3.gem
180
+ gem install rbtagger
55
181
  </pre></p>
56
182
 
57
183
 
58
184
  <h2>The basics</h2>
59
185
 
60
186
 
61
- <p><pre class='syntax'>
187
+ <h4>Using the rule tagger</h4>
188
+ <pre class='syntax'>
62
189
  <span class="ident">require</span> <span class="punct">'</span><span class="string">rbtagger</span><span class="punct">'</span>
63
190
 
64
- <span class="comment"># Using the rule tagger</span>
65
191
  <span class="ident">tagger</span> <span class="punct">=</span> <span class="constant">Brill</span><span class="punct">::</span><span class="constant">Tagger</span><span class="punct">.</span><span class="ident">new</span><span class="punct">(</span> <span class="constant">File</span><span class="punct">.</span><span class="ident">join</span><span class="punct">(</span><span class="constant">File</span><span class="punct">.</span><span class="ident">dirname</span><span class="punct">(</span><span class="constant">__FILE__</span><span class="punct">),&quot;</span><span class="string">LEXICON</span><span class="punct">&quot;),</span>
66
192
  <span class="constant">File</span><span class="punct">.</span><span class="ident">join</span><span class="punct">(</span><span class="constant">File</span><span class="punct">.</span><span class="ident">dirname</span><span class="punct">(</span><span class="constant">__FILE__</span><span class="punct">),&quot;</span><span class="string">LEXICALRULEFILE</span><span class="punct">&quot;),</span>
67
193
  <span class="constant">File</span><span class="punct">.</span><span class="ident">join</span><span class="punct">(</span><span class="constant">File</span><span class="punct">.</span><span class="ident">dirname</span><span class="punct">(</span><span class="constant">__FILE__</span><span class="punct">),&quot;</span><span class="string">CONTEXTUALRULEFILE</span><span class="punct">&quot;)</span> <span class="punct">)</span>
68
194
  <span class="ident">docs</span><span class="punct">.</span><span class="ident">each</span> <span class="keyword">do</span><span class="punct">|</span><span class="ident">doc</span><span class="punct">|</span>
69
195
  <span class="ident">tagger</span><span class="punct">.</span><span class="ident">tag</span><span class="punct">(</span> <span class="constant">File</span><span class="punct">.</span><span class="ident">read</span><span class="punct">(</span> <span class="ident">doc</span> <span class="punct">)</span> <span class="punct">)</span>
70
196
  <span class="keyword">end</span>
197
+ </pre>
198
+
199
+ <h4>Using the word tagger</h4>
200
+ <pre class='syntax'>
201
+ <span class="ident">require</span> <span class="punct">'</span><span class="string">rbtagger</span><span class="punct">'</span>
71
202
 
72
- <span class="comment"># Using the word tagger</span>
73
203
  <span class="ident">tagger</span> <span class="punct">=</span> <span class="constant">Word</span><span class="punct">::</span><span class="constant">Tagger</span><span class="punct">.</span><span class="ident">new</span><span class="punct">(</span> <span class="punct">['</span><span class="string">cat</span><span class="punct">','</span><span class="string">hat</span><span class="punct">'],</span> <span class="symbol">:words</span> <span class="punct">=&gt;</span> <span class="number">4</span> <span class="punct">)</span>
74
204
  <span class="ident">tags</span> <span class="punct">=</span> <span class="ident">tagger</span><span class="punct">.</span><span class="ident">execute</span><span class="punct">(</span> <span class="punct">'</span><span class="string">the cat and the hat</span><span class="punct">'</span> <span class="punct">)</span>
75
205
  <span class="ident">assert_equal</span><span class="punct">(</span> <span class="punct">[&quot;</span><span class="string">cat</span><span class="punct">&quot;,</span> <span class="punct">&quot;</span><span class="string">hat</span><span class="punct">&quot;],</span> <span class="ident">tags</span> <span class="punct">)</span>
76
-
77
-
78
- </pre></p>
79
-
206
+ </pre>
80
207
 
81
208
  <h2>Forum</h2>
82
209
 
@@ -115,12 +242,21 @@ rake install_gem</pre>
115
242
 
116
243
  <p>Comments are welcome. Send an email to <a href="mailto:rb-brill-tagger@googlegroups.com">Todd A. Fisher</a> via the <a href="http://groups.google.com/group/rb-brill-tagger">forum</a>.</p>
117
244
  <p class="coda">
118
- <a href="http://xullicious.blogspot.com/">Todd A. Fisher</a>, 27th May 2008<br>
245
+ <a href="http://xullicious.blogspot.com/">Todd A. Fisher</a>, 22nd June 2008<br>
119
246
  Theme extended from <a href="http://rb2js.rubyforge.org/">Paul Battley</a>
120
247
  </p>
121
248
  </div>
122
249
 
123
250
  <!-- insert site tracking codes here, like Google Urchin -->
251
+ <script type="text/javascript">
252
+ var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
253
+ document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
254
+ </script>
255
+ <script type="text/javascript">
256
+ var pageTracker = _gat._getTracker("UA-246931-6");
257
+ pageTracker._initData();
258
+ pageTracker._trackPageview();
259
+ </script>
124
260
 
125
261
  </body>
126
262
  </html>
data/website/index.txt CHANGED
@@ -12,16 +12,15 @@ This work is based on the work of Eric Brill
12
12
  h2. Installing
13
13
 
14
14
  <pre syntax="bash">
15
- wget http://rubyforge.org/frs/download.php/37542/rbtagger-0.2.3.gem
16
- gem install rbtagger-0.2.3.gem
15
+ gem install rbtagger
17
16
  </pre>
18
17
 
19
18
  h2. The basics
20
19
 
20
+ <h4>Using the rule tagger</h4>
21
21
  <pre syntax="ruby">
22
22
  require 'rbtagger'
23
23
 
24
- # Using the rule tagger
25
24
  tagger = Brill::Tagger.new( File.join(File.dirname(__FILE__),"LEXICON"),
26
25
  File.join(File.dirname(__FILE__),"LEXICALRULEFILE"),
27
26
  File.join(File.dirname(__FILE__),"CONTEXTUALRULEFILE") )
@@ -29,12 +28,17 @@ docs.each do|doc|
29
28
  tagger.tag( File.read( doc ) )
30
29
  end
31
30
 
32
- # Using the word tagger
31
+ tagger.suggest( File.read("sample.txt") )
32
+ => [["doctor", "NN", 3], ["treatment", "NN", 5]]
33
+ </pre>
34
+
35
+ <h4>Using the word tagger</h4>
36
+ <pre syntax="ruby">
37
+ require 'rbtagger'
38
+
33
39
  tagger = Word::Tagger.new( ['cat','hat'], :words => 4 )
34
40
  tags = tagger.execute( 'the cat and the hat' )
35
41
  assert_equal( ["cat", "hat"], tags )
36
-
37
-
38
42
  </pre>
39
43
 
40
44
  h2. Forum
@@ -2,30 +2,157 @@
2
2
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3
3
  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4
4
  <head>
5
- <link rel="stylesheet" href="stylesheets/screen.css" type="text/css" media="screen" />
6
5
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7
- <title>
8
- <%= title %>
9
- </title>
10
- <script src="javascripts/rounded_corners_lite.inc.js" type="text/javascript"></script>
11
- <style>
12
-
13
- </style>
14
- <script type="text/javascript">
15
- window.onload = function() {
16
- settings = {
17
- tl: { radius: 10 },
18
- tr: { radius: 10 },
19
- bl: { radius: 10 },
20
- br: { radius: 10 },
21
- antiAlias: true,
22
- autoPad: true,
23
- validTags: ["div"]
24
- }
25
- var versionBox = new curvyCorners(settings, document.getElementById("version"));
26
- versionBox.applyCornersToAll();
27
- }
28
- </script>
6
+ <title><%= title %></title>
7
+ <style type="text/css">
8
+ body {
9
+ background-color: #F1F1F1;
10
+ font-family: "Georgia", sans-serif;
11
+ font-size: 16px;
12
+ line-height: 1.6em;
13
+ padding: 1.6em 0 0 0;
14
+ color: #333;
15
+ }
16
+ h1, h2, h3, h4, h5, h6 {
17
+ color: #444;
18
+ }
19
+ h1 {
20
+ font-family: sans-serif;
21
+ font-weight: normal;
22
+ font-size: 4em;
23
+ line-height: 0.8em;
24
+ letter-spacing: -0.1ex;
25
+ margin: 5px;
26
+ }
27
+ li {
28
+ padding: 0;
29
+ margin: 0;
30
+ list-style-type: square;
31
+ }
32
+ a {
33
+ color: #5E5AFF;
34
+ background-color: #DAC;
35
+ font-weight: normal;
36
+ text-decoration: underline;
37
+ }
38
+ blockquote {
39
+ font-size: 90%;
40
+ font-style: italic;
41
+ border-left: 1px solid #111;
42
+ padding-left: 1em;
43
+ }
44
+ .caps {
45
+ font-size: 80%;
46
+ }
47
+
48
+ #main {
49
+ width: 45em;
50
+ padding: 0;
51
+ margin: 0 auto;
52
+ }
53
+ .coda {
54
+ text-align: right;
55
+ color: #77f;
56
+ font-size: smaller;
57
+ }
58
+
59
+ table {
60
+ font-size: 90%;
61
+ line-height: 1.4em;
62
+ color: #ff8;
63
+ background-color: #111;
64
+ padding: 2px 10px 2px 10px;
65
+ border-style: dashed;
66
+ }
67
+
68
+ th {
69
+ color: #fff;
70
+ }
71
+
72
+ td {
73
+ padding: 2px 10px 2px 10px;
74
+ }
75
+
76
+ .success {
77
+ color: #0CC52B;
78
+ }
79
+
80
+ .failed {
81
+ color: #E90A1B;
82
+ }
83
+
84
+ .unknown {
85
+ color: #995000;
86
+ }
87
+ pre, code {
88
+ font-family: monospace;
89
+ font-size: 90%;
90
+ line-height: 1.4em;
91
+ color: #ff8;
92
+ background-color: #111;
93
+ padding: 2px 10px 2px 10px;
94
+ }
95
+ .comment { color: #aaa; font-style: italic; }
96
+ .keyword { color: #eff; font-weight: bold; }
97
+ .punct { color: #eee; font-weight: bold; }
98
+ .symbol { color: #0bb; }
99
+ .string { color: #6b4; }
100
+ .ident { color: #ff8; }
101
+ .constant { color: #66f; }
102
+ .regex { color: #ec6; }
103
+ .number { color: #F99; }
104
+ .expr { color: #227; }
105
+
106
+ #version {
107
+ float: right;
108
+ text-align: right;
109
+ font-family: sans-serif;
110
+ font-weight: normal;
111
+ background-color: #B3ABFF;
112
+ color: #141331;
113
+ padding: 15px 20px 10px 20px;
114
+ margin: 0 auto;
115
+ margin-top: 15px;
116
+ border: 3px solid #141331;
117
+ display:block;
118
+ -moz-border-radius-bottomleft:10px;
119
+ -moz-border-radius-bottomright:10px;
120
+ -moz-border-radius-topleft:10px;
121
+ -moz-border-radius-topright:10px;
122
+ -webkit-border-bottom-left-radius:10px;
123
+ -webkit-border-bottom-right-radius:10px;
124
+ -webkit-border-top-left-radius:10px;
125
+ -webkit-border-top-right-radius:10px;
126
+ }
127
+
128
+ #version .numbers {
129
+ display: block;
130
+ font-size: 4em;
131
+ line-height: 0.8em;
132
+ letter-spacing: -0.1ex;
133
+ margin-bottom: 15px;
134
+ }
135
+
136
+ #version p {
137
+ text-decoration: none;
138
+ color: #141331;
139
+ background-color: #B3ABFF;
140
+ margin: 0;
141
+ padding: 0;
142
+ }
143
+
144
+ #version a {
145
+ text-decoration: none;
146
+ color: #141331;
147
+ background-color: #B3ABFF;
148
+ }
149
+
150
+ .clickable {
151
+ cursor: pointer;
152
+ cursor: hand;
153
+ }
154
+
155
+ </style>
29
156
  </head>
30
157
  <body>
31
158
  <div id="main">
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbtagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Todd A. Fisher
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-06-22 00:00:00 -04:00
12
+ date: 2008-06-23 00:00:00 -04:00
13
13
  default_executable:
14
14
  dependencies: []
15
15