rbtagger 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/brill/tagger.rb CHANGED
@@ -9,6 +9,34 @@ module Brill
9
9
  Brill::Tagger.load_contextual_rules(@tagger,contextual_rules)
10
10
  end
11
11
 
12
# Suggest at most +max+ noun tokens found in +text+.
#
# The text is run through +tag+ and only the entries whose tag (the last
# element of each entry) matches /NN/ are kept.
#
# * When there are no more than +max+ such entries they are returned
#   unchanged, i.e. as [token, tag] pairs, duplicates included.
# * Otherwise the entries are collapsed to one [token, tag, count] triple
#   per distinct token (the tag is taken from the token's first occurrence,
#   count is the number of occurrences) and low-count tokens are dropped
#   until no more than +max+ triples remain.
#
# text:: the body of text to analyse
# max::  upper bound on the number of entries returned (default 10)
def suggest( text, max = 10 )
  nouns = tag(text).select { |entry| entry.last =~ /NN/ }
  return nouns unless nouns.size > max

  # Occurrences per token.
  counts = Hash.new(0)
  nouns.each { |entry| counts[entry.first] += 1 }

  # One triple per distinct token, in order of first appearance.
  seen = {}
  ranked = []
  nouns.each do |entry|
    token = entry.first
    next if seen[token]
    seen[token] = true
    ranked << [token, entry.last, counts[token]]
  end

  # Raise the minimum count step by step (keeping only counts > 1, > 2,
  # > 3, > 4). Each pass leaves the triples sorted by ascending count.
  threshold = 1
  until ranked.size <= max
    ranked = ranked.sort_by { |triple| triple.last }.select { |triple| triple.last > threshold }
    threshold += 1
    if threshold == 5
      # Still too many after the last pass: keep the +max+ most frequent,
      # highest count first. The range is exclusive so that exactly +max+
      # entries are kept (an inclusive 0..max would return max + 1).
      ranked = ranked.reverse[0...max]
      break
    end
  end
  ranked
end
39
+
12
40
  # Tag a body of text
13
41
  # returns an array like [[token,tag],[token,tag]...[token,tag]]
14
42
  #
@@ -2,7 +2,7 @@ module RbTagger #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 2
5
- TINY = 4
5
+ TINY = 5
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
data/test/test_helper.rb CHANGED
@@ -1,2 +1,5 @@
1
1
  require 'test/unit'
2
- require File.dirname(__FILE__) + '/../lib/rbtagger'
2
+ $:.unshift File.join(File.dirname(__FILE__),'..','ext','rule_tagger')
3
+ $:.unshift File.join(File.dirname(__FILE__),'..','ext','word_tagger')
4
+ $:.unshift File.join(File.dirname(__FILE__),'..','lib')
5
+ require 'rbtagger'
@@ -54,6 +54,11 @@ Although many newly diagnosed patients fear they will not be able to keep workin
54
54
  puts "time: #{duration} sec #{count.to_f/duration} docs/sec"
55
55
  end
56
56
 
57
# suggest on the sample document should yield the recurring nouns together
# with their occurrence counts.
def test_suggest
  expected = [["doctor", "NN", 3], ["treatment", "NN", 5]]
  assert_equal expected, tagger.suggest( SAMPLE_DOC )
end
61
+
57
62
  private
58
63
  def tagger
59
64
  $rtagger
data/website/index.html CHANGED
@@ -2,30 +2,157 @@
2
2
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3
3
  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4
4
  <head>
5
- <link rel="stylesheet" href="stylesheets/screen.css" type="text/css" media="screen" />
6
5
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7
- <title>
8
- rbtagger
9
- </title>
10
- <script src="javascripts/rounded_corners_lite.inc.js" type="text/javascript"></script>
11
- <style>
12
-
13
- </style>
14
- <script type="text/javascript">
15
- window.onload = function() {
16
- settings = {
17
- tl: { radius: 10 },
18
- tr: { radius: 10 },
19
- bl: { radius: 10 },
20
- br: { radius: 10 },
21
- antiAlias: true,
22
- autoPad: true,
23
- validTags: ["div"]
24
- }
25
- var versionBox = new curvyCorners(settings, document.getElementById("version"));
26
- versionBox.applyCornersToAll();
27
- }
28
- </script>
6
+ <title>rbtagger</title>
7
+ <style type="text/css">
8
+ body {
9
+ background-color: #F1F1F1;
10
+ font-family: "Georgia", sans-serif;
11
+ font-size: 16px;
12
+ line-height: 1.6em;
13
+ padding: 1.6em 0 0 0;
14
+ color: #333;
15
+ }
16
+ h1, h2, h3, h4, h5, h6 {
17
+ color: #444;
18
+ }
19
+ h1 {
20
+ font-family: sans-serif;
21
+ font-weight: normal;
22
+ font-size: 4em;
23
+ line-height: 0.8em;
24
+ letter-spacing: -0.1ex;
25
+ margin: 5px;
26
+ }
27
+ li {
28
+ padding: 0;
29
+ margin: 0;
30
+ list-style-type: square;
31
+ }
32
+ a {
33
+ color: #5E5AFF;
34
+ background-color: #DAC;
35
+ font-weight: normal;
36
+ text-decoration: underline;
37
+ }
38
+ blockquote {
39
+ font-size: 90%;
40
+ font-style: italic;
41
+ border-left: 1px solid #111;
42
+ padding-left: 1em;
43
+ }
44
+ .caps {
45
+ font-size: 80%;
46
+ }
47
+
48
+ #main {
49
+ width: 45em;
50
+ padding: 0;
51
+ margin: 0 auto;
52
+ }
53
+ .coda {
54
+ text-align: right;
55
+ color: #77f;
56
+ font-size: smaller;
57
+ }
58
+
59
+ table {
60
+ font-size: 90%;
61
+ line-height: 1.4em;
62
+ color: #ff8;
63
+ background-color: #111;
64
+ padding: 2px 10px 2px 10px;
65
+ border-style: dashed;
66
+ }
67
+
68
+ th {
69
+ color: #fff;
70
+ }
71
+
72
+ td {
73
+ padding: 2px 10px 2px 10px;
74
+ }
75
+
76
+ .success {
77
+ color: #0CC52B;
78
+ }
79
+
80
+ .failed {
81
+ color: #E90A1B;
82
+ }
83
+
84
+ .unknown {
85
+ color: #995000;
86
+ }
87
+ pre, code {
88
+ font-family: monospace;
89
+ font-size: 90%;
90
+ line-height: 1.4em;
91
+ color: #ff8;
92
+ background-color: #111;
93
+ padding: 2px 10px 2px 10px;
94
+ }
95
+ .comment { color: #aaa; font-style: italic; }
96
+ .keyword { color: #eff; font-weight: bold; }
97
+ .punct { color: #eee; font-weight: bold; }
98
+ .symbol { color: #0bb; }
99
+ .string { color: #6b4; }
100
+ .ident { color: #ff8; }
101
+ .constant { color: #66f; }
102
+ .regex { color: #ec6; }
103
+ .number { color: #F99; }
104
+ .expr { color: #227; }
105
+
106
+ #version {
107
+ float: right;
108
+ text-align: right;
109
+ font-family: sans-serif;
110
+ font-weight: normal;
111
+ background-color: #B3ABFF;
112
+ color: #141331;
113
+ padding: 15px 20px 10px 20px;
114
+ margin: 0 auto;
115
+ margin-top: 15px;
116
+ border: 3px solid #141331;
117
+ display:block;
118
+ -moz-border-radius-bottomleft:10px;
119
+ -moz-border-radius-bottomright:10px;
120
+ -moz-border-radius-topleft:10px;
121
+ -moz-border-radius-topright:10px;
122
+ -webkit-border-bottom-left-radius:10px;
123
+ -webkit-border-bottom-right-radius:10px;
124
+ -webkit-border-top-left-radius:10px;
125
+ -webkit-border-top-right-radius:10px;
126
+ }
127
+
128
+ #version .numbers {
129
+ display: block;
130
+ font-size: 4em;
131
+ line-height: 0.8em;
132
+ letter-spacing: -0.1ex;
133
+ margin-bottom: 15px;
134
+ }
135
+
136
+ #version p {
137
+ text-decoration: none;
138
+ color: #141331;
139
+ background-color: #B3ABFF;
140
+ margin: 0;
141
+ padding: 0;
142
+ }
143
+
144
+ #version a {
145
+ text-decoration: none;
146
+ color: #141331;
147
+ background-color: #B3ABFF;
148
+ }
149
+
150
+ .clickable {
151
+ cursor: pointer;
152
+ cursor: hand;
153
+ }
154
+
155
+ </style>
29
156
  </head>
30
157
  <body>
31
158
  <div id="main">
@@ -33,7 +160,7 @@
33
160
  <h1>rbtagger</h1>
34
161
  <div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/rbtagger"; return false'>
35
162
  <p>Get Version</p>
36
- <a href="http://rubyforge.org/projects/rbtagger" class="numbers">0.2.3</a>
163
+ <a href="http://rubyforge.org/projects/rbtagger" class="numbers">0.2.4</a>
37
164
  </div>
38
165
  <h4 style="float:right;padding-right:10px;"> &#x2192; &#8216;rbtagger&#8217;</h4>
39
166
 
@@ -50,33 +177,33 @@
50
177
 
51
178
 
52
179
  <p><pre class='syntax'>
53
- wget http://rubyforge.org/frs/download.php/37542/rbtagger-0.2.3.gem
54
- gem install rbtagger-0.2.3.gem
180
+ gem install rbtagger
55
181
  </pre></p>
56
182
 
57
183
 
58
184
  <h2>The basics</h2>
59
185
 
60
186
 
61
- <p><pre class='syntax'>
187
+ <h4>Using the rule tagger</h4>
188
+ <pre class='syntax'>
62
189
  <span class="ident">require</span> <span class="punct">'</span><span class="string">rbtagger</span><span class="punct">'</span>
63
190
 
64
- <span class="comment"># Using the rule tagger</span>
65
191
  <span class="ident">tagger</span> <span class="punct">=</span> <span class="constant">Brill</span><span class="punct">::</span><span class="constant">Tagger</span><span class="punct">.</span><span class="ident">new</span><span class="punct">(</span> <span class="constant">File</span><span class="punct">.</span><span class="ident">join</span><span class="punct">(</span><span class="constant">File</span><span class="punct">.</span><span class="ident">dirname</span><span class="punct">(</span><span class="constant">__FILE__</span><span class="punct">),&quot;</span><span class="string">LEXICON</span><span class="punct">&quot;),</span>
66
192
  <span class="constant">File</span><span class="punct">.</span><span class="ident">join</span><span class="punct">(</span><span class="constant">File</span><span class="punct">.</span><span class="ident">dirname</span><span class="punct">(</span><span class="constant">__FILE__</span><span class="punct">),&quot;</span><span class="string">LEXICALRULEFILE</span><span class="punct">&quot;),</span>
67
193
  <span class="constant">File</span><span class="punct">.</span><span class="ident">join</span><span class="punct">(</span><span class="constant">File</span><span class="punct">.</span><span class="ident">dirname</span><span class="punct">(</span><span class="constant">__FILE__</span><span class="punct">),&quot;</span><span class="string">CONTEXTUALRULEFILE</span><span class="punct">&quot;)</span> <span class="punct">)</span>
68
194
  <span class="ident">docs</span><span class="punct">.</span><span class="ident">each</span> <span class="keyword">do</span><span class="punct">|</span><span class="ident">doc</span><span class="punct">|</span>
69
195
  <span class="ident">tagger</span><span class="punct">.</span><span class="ident">tag</span><span class="punct">(</span> <span class="constant">File</span><span class="punct">.</span><span class="ident">read</span><span class="punct">(</span> <span class="ident">doc</span> <span class="punct">)</span> <span class="punct">)</span>
70
196
  <span class="keyword">end</span>
197
+ </pre>
198
+
199
+ <h4>Using the word tagger</h4>
200
+ <pre class='syntax'>
201
+ <span class="ident">require</span> <span class="punct">'</span><span class="string">rbtagger</span><span class="punct">'</span>
71
202
 
72
- <span class="comment"># Using the word tagger</span>
73
203
  <span class="ident">tagger</span> <span class="punct">=</span> <span class="constant">Word</span><span class="punct">::</span><span class="constant">Tagger</span><span class="punct">.</span><span class="ident">new</span><span class="punct">(</span> <span class="punct">['</span><span class="string">cat</span><span class="punct">','</span><span class="string">hat</span><span class="punct">'],</span> <span class="symbol">:words</span> <span class="punct">=&gt;</span> <span class="number">4</span> <span class="punct">)</span>
74
204
  <span class="ident">tags</span> <span class="punct">=</span> <span class="ident">tagger</span><span class="punct">.</span><span class="ident">execute</span><span class="punct">(</span> <span class="punct">'</span><span class="string">the cat and the hat</span><span class="punct">'</span> <span class="punct">)</span>
75
205
  <span class="ident">assert_equal</span><span class="punct">(</span> <span class="punct">[&quot;</span><span class="string">cat</span><span class="punct">&quot;,</span> <span class="punct">&quot;</span><span class="string">hat</span><span class="punct">&quot;],</span> <span class="ident">tags</span> <span class="punct">)</span>
76
-
77
-
78
- </pre></p>
79
-
206
+ </pre>
80
207
 
81
208
  <h2>Forum</h2>
82
209
 
@@ -115,12 +242,21 @@ rake install_gem</pre>
115
242
 
116
243
  <p>Comments are welcome. Send an email to <a href="mailto:rb-brill-tagger@googlegroups.com">Todd A. Fisher</a> via the <a href="http://groups.google.com/group/rb-brill-tagger">forum</a>.</p>
117
244
  <p class="coda">
118
- <a href="http://xullicious.blogspot.com/">Todd A. Fisher</a>, 27th May 2008<br>
245
+ <a href="http://xullicious.blogspot.com/">Todd A. Fisher</a>, 22nd June 2008<br>
119
246
  Theme extended from <a href="http://rb2js.rubyforge.org/">Paul Battley</a>
120
247
  </p>
121
248
  </div>
122
249
 
123
250
  <!-- insert site tracking codes here, like Google Urchin -->
251
+ <script type="text/javascript">
252
+ var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
253
+ document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
254
+ </script>
255
+ <script type="text/javascript">
256
+ var pageTracker = _gat._getTracker("UA-246931-6");
257
+ pageTracker._initData();
258
+ pageTracker._trackPageview();
259
+ </script>
124
260
 
125
261
  </body>
126
262
  </html>
data/website/index.txt CHANGED
@@ -12,16 +12,15 @@ This work is based on the work of Eric Brill
12
12
  h2. Installing
13
13
 
14
14
  <pre syntax="bash">
15
- wget http://rubyforge.org/frs/download.php/37542/rbtagger-0.2.3.gem
16
- gem install rbtagger-0.2.3.gem
15
+ gem install rbtagger
17
16
  </pre>
18
17
 
19
18
  h2. The basics
20
19
 
20
+ <h4>Using the rule tagger</h4>
21
21
  <pre syntax="ruby">
22
22
  require 'rbtagger'
23
23
 
24
- # Using the rule tagger
25
24
  tagger = Brill::Tagger.new( File.join(File.dirname(__FILE__),"LEXICON"),
26
25
  File.join(File.dirname(__FILE__),"LEXICALRULEFILE"),
27
26
  File.join(File.dirname(__FILE__),"CONTEXTUALRULEFILE") )
@@ -29,12 +28,17 @@ docs.each do|doc|
29
28
  tagger.tag( File.read( doc ) )
30
29
  end
31
30
 
32
- # Using the word tagger
31
+ tagger.suggest( File.read("sample.txt") )
32
+ => [["doctor", "NN", 3], ["treatment", "NN", 5]]
33
+ </pre>
34
+
35
+ <h4>Using the word tagger</h4>
36
+ <pre syntax="ruby">
37
+ require 'rbtagger'
38
+
33
39
  tagger = Word::Tagger.new( ['cat','hat'], :words => 4 )
34
40
  tags = tagger.execute( 'the cat and the hat' )
35
41
  assert_equal( ["cat", "hat"], tags )
36
-
37
-
38
42
  </pre>
39
43
 
40
44
  h2. Forum
@@ -2,30 +2,157 @@
2
2
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3
3
  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4
4
  <head>
5
- <link rel="stylesheet" href="stylesheets/screen.css" type="text/css" media="screen" />
6
5
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7
- <title>
8
- <%= title %>
9
- </title>
10
- <script src="javascripts/rounded_corners_lite.inc.js" type="text/javascript"></script>
11
- <style>
12
-
13
- </style>
14
- <script type="text/javascript">
15
- window.onload = function() {
16
- settings = {
17
- tl: { radius: 10 },
18
- tr: { radius: 10 },
19
- bl: { radius: 10 },
20
- br: { radius: 10 },
21
- antiAlias: true,
22
- autoPad: true,
23
- validTags: ["div"]
24
- }
25
- var versionBox = new curvyCorners(settings, document.getElementById("version"));
26
- versionBox.applyCornersToAll();
27
- }
28
- </script>
6
+ <title><%= title %></title>
7
+ <style type="text/css">
8
+ body {
9
+ background-color: #F1F1F1;
10
+ font-family: "Georgia", sans-serif;
11
+ font-size: 16px;
12
+ line-height: 1.6em;
13
+ padding: 1.6em 0 0 0;
14
+ color: #333;
15
+ }
16
+ h1, h2, h3, h4, h5, h6 {
17
+ color: #444;
18
+ }
19
+ h1 {
20
+ font-family: sans-serif;
21
+ font-weight: normal;
22
+ font-size: 4em;
23
+ line-height: 0.8em;
24
+ letter-spacing: -0.1ex;
25
+ margin: 5px;
26
+ }
27
+ li {
28
+ padding: 0;
29
+ margin: 0;
30
+ list-style-type: square;
31
+ }
32
+ a {
33
+ color: #5E5AFF;
34
+ background-color: #DAC;
35
+ font-weight: normal;
36
+ text-decoration: underline;
37
+ }
38
+ blockquote {
39
+ font-size: 90%;
40
+ font-style: italic;
41
+ border-left: 1px solid #111;
42
+ padding-left: 1em;
43
+ }
44
+ .caps {
45
+ font-size: 80%;
46
+ }
47
+
48
+ #main {
49
+ width: 45em;
50
+ padding: 0;
51
+ margin: 0 auto;
52
+ }
53
+ .coda {
54
+ text-align: right;
55
+ color: #77f;
56
+ font-size: smaller;
57
+ }
58
+
59
+ table {
60
+ font-size: 90%;
61
+ line-height: 1.4em;
62
+ color: #ff8;
63
+ background-color: #111;
64
+ padding: 2px 10px 2px 10px;
65
+ border-style: dashed;
66
+ }
67
+
68
+ th {
69
+ color: #fff;
70
+ }
71
+
72
+ td {
73
+ padding: 2px 10px 2px 10px;
74
+ }
75
+
76
+ .success {
77
+ color: #0CC52B;
78
+ }
79
+
80
+ .failed {
81
+ color: #E90A1B;
82
+ }
83
+
84
+ .unknown {
85
+ color: #995000;
86
+ }
87
+ pre, code {
88
+ font-family: monospace;
89
+ font-size: 90%;
90
+ line-height: 1.4em;
91
+ color: #ff8;
92
+ background-color: #111;
93
+ padding: 2px 10px 2px 10px;
94
+ }
95
+ .comment { color: #aaa; font-style: italic; }
96
+ .keyword { color: #eff; font-weight: bold; }
97
+ .punct { color: #eee; font-weight: bold; }
98
+ .symbol { color: #0bb; }
99
+ .string { color: #6b4; }
100
+ .ident { color: #ff8; }
101
+ .constant { color: #66f; }
102
+ .regex { color: #ec6; }
103
+ .number { color: #F99; }
104
+ .expr { color: #227; }
105
+
106
+ #version {
107
+ float: right;
108
+ text-align: right;
109
+ font-family: sans-serif;
110
+ font-weight: normal;
111
+ background-color: #B3ABFF;
112
+ color: #141331;
113
+ padding: 15px 20px 10px 20px;
114
+ margin: 0 auto;
115
+ margin-top: 15px;
116
+ border: 3px solid #141331;
117
+ display:block;
118
+ -moz-border-radius-bottomleft:10px;
119
+ -moz-border-radius-bottomright:10px;
120
+ -moz-border-radius-topleft:10px;
121
+ -moz-border-radius-topright:10px;
122
+ -webkit-border-bottom-left-radius:10px;
123
+ -webkit-border-bottom-right-radius:10px;
124
+ -webkit-border-top-left-radius:10px;
125
+ -webkit-border-top-right-radius:10px;
126
+ }
127
+
128
+ #version .numbers {
129
+ display: block;
130
+ font-size: 4em;
131
+ line-height: 0.8em;
132
+ letter-spacing: -0.1ex;
133
+ margin-bottom: 15px;
134
+ }
135
+
136
+ #version p {
137
+ text-decoration: none;
138
+ color: #141331;
139
+ background-color: #B3ABFF;
140
+ margin: 0;
141
+ padding: 0;
142
+ }
143
+
144
+ #version a {
145
+ text-decoration: none;
146
+ color: #141331;
147
+ background-color: #B3ABFF;
148
+ }
149
+
150
+ .clickable {
151
+ cursor: pointer;
152
+ cursor: hand;
153
+ }
154
+
155
+ </style>
29
156
  </head>
30
157
  <body>
31
158
  <div id="main">
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbtagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Todd A. Fisher
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-06-22 00:00:00 -04:00
12
+ date: 2008-06-23 00:00:00 -04:00
13
13
  default_executable:
14
14
  dependencies: []
15
15