ankusa 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,168 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Class: Ankusa::NBClass</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
/**
 * Open `url` in a small, resizable, scrollable auxiliary browser window
 * named "Code" (no toolbar, no status bar, 400x150).
 */
function popupCode( url ) {
  var windowName = "Code";
  var windowFeatures = "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400";

  window.open( url, windowName, windowFeatures );
}
18
+
19
/**
 * Toggle the inline display style of the element whose id is `id`
 * between "block" and "none".
 *
 * @param {string} id  id attribute of the element to show or hide
 * @return {boolean} true when the display style was switched; false when
 *     no DOM lookup API is available or no element has that id
 */
function toggleCode( id ) {
  // Declared locally: the previous version assigned to undeclared names,
  // creating (or overwriting) the globals `elem` and `elemStyle`.
  var elem;

  if ( document.getElementById ) {
    elem = document.getElementById( id );
  } else if ( document.all ) {
    // Property lookup instead of eval( "document.all." + id ): an id such
    // as "M000014-source" contains "-" and was evaluated as a subtraction.
    elem = document.all[ id ];
  } else {
    return false;
  }

  // Unknown id: nothing to toggle (previously a TypeError on elem.style).
  if ( !elem ) {
    return false;
  }

  var elemStyle = elem.style;

  // Any value other than "block" -- including an unset inline style --
  // is treated as hidden.
  if ( elemStyle.display != "block" ) {
    elemStyle.display = "block";
  } else {
    elemStyle.display = "none";
  }

  return true;
}
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Class</strong></td>
53
+ <td class="class-name-in-header">Ankusa::NBClass</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../../files/lib/ankusa/nbclass_rb.html">
59
+ lib/ankusa/nbclass.rb
60
+ </a>
61
+ <br />
62
+ </td>
63
+ </tr>
64
+
65
+ <tr class="top-aligned-row">
66
+ <td><strong>Parent:</strong></td>
67
+ <td>
68
+ Object
69
+ </td>
70
+ </tr>
71
+ </table>
72
+ </div>
73
+ <!-- banner header -->
74
+
75
+ <div id="bodyContent">
76
+
77
+
78
+
79
+ <div id="contextContent">
80
+
81
+
82
+
83
+ </div>
84
+
85
+ <div id="method-list">
86
+ <h3 class="section-bar">Methods</h3>
87
+
88
+ <div class="name-list">
89
+ <a href="#M000014">new</a>&nbsp;&nbsp;
90
+ </div>
91
+ </div>
92
+
93
+ </div>
94
+
95
+
96
+ <!-- if includes -->
97
+
98
+ <div id="section">
99
+
100
+
101
+
102
+
103
+
104
+ <div id="attribute-list">
105
+ <h3 class="section-bar">Attributes</h3>
106
+
107
+ <div class="name-list">
108
+ <table>
109
+ <tr class="top-aligned-row context-row">
110
+ <td class="context-item-name">doc_count</td>
111
+ <td class="context-item-value">&nbsp;[R]&nbsp;</td>
112
+ <td class="context-item-desc"></td>
113
+ </tr>
114
+ <tr class="top-aligned-row context-row">
115
+ <td class="context-item-name">word_count</td>
116
+ <td class="context-item-value">&nbsp;[R]&nbsp;</td>
117
+ <td class="context-item-desc"></td>
118
+ </tr>
119
+ </table>
120
+ </div>
121
+ </div>
122
+
123
+
124
+
125
+ <!-- if method_list -->
126
+ <div id="methods">
127
+ <h3 class="section-bar">Public Class methods</h3>
128
+
129
+ <div id="method-M000014" class="method-detail">
130
+ <a name="M000014"></a>
131
+
132
+ <div class="method-heading">
133
+ <a href="#M000014" class="method-signature">
134
+ <span class="method-name">new</span><span class="method-args">(name, summary_table, freq_table)</span>
135
+ </a>
136
+ </div>
137
+
138
+ <div class="method-description">
139
+ <p><a class="source-toggle" href="#"
140
+ onclick="toggleCode('M000014-source');return false;">[Source]</a></p>
141
+ <div class="method-source-code" id="M000014-source">
142
+ <pre>
143
+ <span class="ruby-comment cmt"># File lib/ankusa/nbclass.rb, line 6</span>
144
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">name</span>, <span class="ruby-identifier">summary_table</span>, <span class="ruby-identifier">freq_table</span>)
145
+ <span class="ruby-ivar">@name</span> = <span class="ruby-identifier">name</span>
146
+ <span class="ruby-ivar">@summary_table</span> = <span class="ruby-identifier">summary_table</span>
147
+ <span class="ruby-ivar">@freq_table</span> = <span class="ruby-identifier">freq_table</span>
148
+ <span class="ruby-ivar">@word_count</span> = <span class="ruby-ivar">@summary_table</span>.<span class="ruby-identifier">get</span>(<span class="ruby-ivar">@name</span>, <span class="ruby-value str">&quot;totals:wordcount&quot;</span>).<span class="ruby-identifier">first</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span>
149
+ <span class="ruby-ivar">@doc_count</span> = <span class="ruby-ivar">@summary_table</span>.<span class="ruby-identifier">get</span>(<span class="ruby-ivar">@name</span>, <span class="ruby-value str">&quot;totals:doccount&quot;</span>).<span class="ruby-identifier">first</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span>
150
+ <span class="ruby-keyword kw">end</span>
151
+ </pre>
152
+ </div>
153
+ </div>
154
+ </div>
155
+
156
+
157
+ </div>
158
+
159
+
160
+ </div>
161
+
162
+
163
+ <div id="validator-badges">
164
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
165
+ </div>
166
+
167
+ </body>
168
+ </html>
@@ -0,0 +1,220 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Class: Ankusa::TextHash</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
/**
 * Open `url` in a small, resizable, scrollable auxiliary browser window
 * named "Code" (no toolbar, no status bar, 400x150).
 */
function popupCode( url ) {
  var windowName = "Code";
  var windowFeatures = "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400";

  window.open( url, windowName, windowFeatures );
}
18
+
19
/**
 * Toggle the inline display style of the element whose id is `id`
 * between "block" and "none".
 *
 * @param {string} id  id attribute of the element to show or hide
 * @return {boolean} true when the display style was switched; false when
 *     no DOM lookup API is available or no element has that id
 */
function toggleCode( id ) {
  // Declared locally: the previous version assigned to undeclared names,
  // creating (or overwriting) the globals `elem` and `elemStyle`.
  var elem;

  if ( document.getElementById ) {
    elem = document.getElementById( id );
  } else if ( document.all ) {
    // Property lookup instead of eval( "document.all." + id ): an id such
    // as "M000015-source" contains "-" and was evaluated as a subtraction.
    elem = document.all[ id ];
  } else {
    return false;
  }

  // Unknown id: nothing to toggle (previously a TypeError on elem.style).
  if ( !elem ) {
    return false;
  }

  var elemStyle = elem.style;

  // Any value other than "block" -- including an unset inline style --
  // is treated as hidden.
  if ( elemStyle.display != "block" ) {
    elemStyle.display = "block";
  } else {
    elemStyle.display = "none";
  }

  return true;
}
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Class</strong></td>
53
+ <td class="class-name-in-header">Ankusa::TextHash</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../../files/lib/ankusa/hasher_rb.html">
59
+ lib/ankusa/hasher.rb
60
+ </a>
61
+ <br />
62
+ </td>
63
+ </tr>
64
+
65
+ <tr class="top-aligned-row">
66
+ <td><strong>Parent:</strong></td>
67
+ <td>
68
+ Hash
69
+ </td>
70
+ </tr>
71
+ </table>
72
+ </div>
73
+ <!-- banner header -->
74
+
75
+ <div id="bodyContent">
76
+
77
+
78
+
79
+ <div id="contextContent">
80
+
81
+
82
+
83
+ </div>
84
+
85
+ <div id="method-list">
86
+ <h3 class="section-bar">Methods</h3>
87
+
88
+ <div class="name-list">
89
+ <a href="#M000016">add_text</a>&nbsp;&nbsp;
90
+ <a href="#M000017">add_word</a>&nbsp;&nbsp;
91
+ <a href="#M000015">new</a>&nbsp;&nbsp;
92
+ </div>
93
+ </div>
94
+
95
+ </div>
96
+
97
+
98
+ <!-- if includes -->
99
+
100
+ <div id="section">
101
+
102
+
103
+
104
+
105
+
106
+ <div id="attribute-list">
107
+ <h3 class="section-bar">Attributes</h3>
108
+
109
+ <div class="name-list">
110
+ <table>
111
+ <tr class="top-aligned-row context-row">
112
+ <td class="context-item-name">word_count</td>
113
+ <td class="context-item-value">&nbsp;[R]&nbsp;</td>
114
+ <td class="context-item-desc"></td>
115
+ </tr>
116
+ </table>
117
+ </div>
118
+ </div>
119
+
120
+
121
+
122
+ <!-- if method_list -->
123
+ <div id="methods">
124
+ <h3 class="section-bar">Public Class methods</h3>
125
+
126
+ <div id="method-M000015" class="method-detail">
127
+ <a name="M000015"></a>
128
+
129
+ <div class="method-heading">
130
+ <a href="#M000015" class="method-signature">
131
+ <span class="method-name">new</span><span class="method-args">(text=nil)</span>
132
+ </a>
133
+ </div>
134
+
135
+ <div class="method-description">
136
+ <p><a class="source-toggle" href="#"
137
+ onclick="toggleCode('M000015-source');return false;">[Source]</a></p>
138
+ <div class="method-source-code" id="M000015-source">
139
+ <pre>
140
+ <span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 9</span>
141
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">text</span>=<span class="ruby-keyword kw">nil</span>)
142
+ <span class="ruby-keyword kw">super</span> <span class="ruby-value">0</span>
143
+ <span class="ruby-ivar">@word_count</span> = <span class="ruby-value">0</span>
144
+ <span class="ruby-identifier">add_text</span>(<span class="ruby-identifier">text</span>) <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">nil?</span>
145
+ <span class="ruby-keyword kw">end</span>
146
+ </pre>
147
+ </div>
148
+ </div>
149
+ </div>
150
+
151
+ <h3 class="section-bar">Public Instance methods</h3>
152
+
153
+ <div id="method-M000016" class="method-detail">
154
+ <a name="M000016"></a>
155
+
156
+ <div class="method-heading">
157
+ <a href="#M000016" class="method-signature">
158
+ <span class="method-name">add_text</span><span class="method-args">(text)</span>
159
+ </a>
160
+ </div>
161
+
162
+ <div class="method-description">
163
+ <p><a class="source-toggle" href="#"
164
+ onclick="toggleCode('M000016-source');return false;">[Source]</a></p>
165
+ <div class="method-source-code" id="M000016-source">
166
+ <pre>
167
+ <span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 15</span>
168
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_text</span>(<span class="ruby-identifier">text</span>)
169
+ <span class="ruby-comment cmt"># replace dashes with spaces, then get rid of non-word/non-space characters, </span>
170
+ <span class="ruby-comment cmt"># then split by space to get words</span>
171
+ <span class="ruby-identifier">words</span> = <span class="ruby-identifier">text</span>.<span class="ruby-identifier">tr</span>(<span class="ruby-value str">'-'</span>, <span class="ruby-value str">' '</span>).<span class="ruby-identifier">gsub</span>(<span class="ruby-regexp re">/[^\w\s]/</span>,<span class="ruby-value str">&quot;&quot;</span>).<span class="ruby-identifier">split</span>
172
+ <span class="ruby-identifier">words</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span><span class="ruby-operator">|</span> <span class="ruby-identifier">add_word</span> <span class="ruby-identifier">word</span> }
173
+ <span class="ruby-keyword kw">self</span>
174
+ <span class="ruby-keyword kw">end</span>
175
+ </pre>
176
+ </div>
177
+ </div>
178
+ </div>
179
+
180
+ <div id="method-M000017" class="method-detail">
181
+ <a name="M000017"></a>
182
+
183
+ <div class="method-heading">
184
+ <a href="#M000017" class="method-signature">
185
+ <span class="method-name">add_word</span><span class="method-args">(word)</span>
186
+ </a>
187
+ </div>
188
+
189
+ <div class="method-description">
190
+ <p><a class="source-toggle" href="#"
191
+ onclick="toggleCode('M000017-source');return false;">[Source]</a></p>
192
+ <div class="method-source-code" id="M000017-source">
193
+ <pre>
194
+ <span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 23</span>
195
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_word</span>(<span class="ruby-identifier">word</span>)
196
+ <span class="ruby-identifier">word</span> = <span class="ruby-identifier">word</span>.<span class="ruby-identifier">downcase</span>
197
+ <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-constant">Ankusa</span><span class="ruby-operator">::</span><span class="ruby-constant">STOPWORDS</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">word</span>
198
+ <span class="ruby-ivar">@word_count</span> <span class="ruby-operator">+=</span> <span class="ruby-value">1</span>
199
+ <span class="ruby-identifier">key</span> = <span class="ruby-identifier">word</span>.<span class="ruby-identifier">stem</span>.<span class="ruby-identifier">intern</span>
200
+ <span class="ruby-identifier">store</span> <span class="ruby-identifier">key</span>, <span class="ruby-identifier">fetch</span>(<span class="ruby-identifier">key</span>, <span class="ruby-value">0</span>)<span class="ruby-operator">+</span><span class="ruby-value">1</span>
201
+ <span class="ruby-keyword kw">end</span>
202
+ <span class="ruby-keyword kw">end</span>
203
+ </pre>
204
+ </div>
205
+ </div>
206
+ </div>
207
+
208
+
209
+ </div>
210
+
211
+
212
+ </div>
213
+
214
+
215
+ <div id="validator-badges">
216
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
217
+ </div>
218
+
219
+ </body>
220
+ </html>
data/docs/created.rid ADDED
@@ -0,0 +1 @@
1
+ Tue, 30 Nov 2010 16:28:42 -0500
@@ -0,0 +1,141 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>File: README.rdoc</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
/**
 * Open `url` in a small, resizable, scrollable auxiliary browser window
 * named "Code" (no toolbar, no status bar, 400x150).
 */
function popupCode( url ) {
  var windowName = "Code";
  var windowFeatures = "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400";

  window.open( url, windowName, windowFeatures );
}
18
+
19
/**
 * Toggle the inline display style of the element whose id is `id`
 * between "block" and "none".
 *
 * @param {string} id  id attribute of the element to show or hide
 * @return {boolean} true when the display style was switched; false when
 *     no DOM lookup API is available or no element has that id
 */
function toggleCode( id ) {
  // Declared locally: the previous version assigned to undeclared names,
  // creating (or overwriting) the globals `elem` and `elemStyle`.
  var elem;

  if ( document.getElementById ) {
    elem = document.getElementById( id );
  } else if ( document.all ) {
    // Property lookup instead of eval( "document.all." + id ): an id that
    // contains "-" was evaluated as a subtraction expression.
    elem = document.all[ id ];
  } else {
    return false;
  }

  // Unknown id: nothing to toggle (previously a TypeError on elem.style).
  if ( !elem ) {
    return false;
  }

  var elemStyle = elem.style;

  // Any value other than "block" -- including an unset inline style --
  // is treated as hidden.
  if ( elemStyle.display != "block" ) {
    elemStyle.display = "block";
  } else {
    elemStyle.display = "none";
  }

  return true;
}
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="fileHeader">
50
+ <h1>README.rdoc</h1>
51
+ <table class="header-table">
52
+ <tr class="top-aligned-row">
53
+ <td><strong>Path:</strong></td>
54
+ <td>README.rdoc
55
+ </td>
56
+ </tr>
57
+ <tr class="top-aligned-row">
58
+ <td><strong>Last Update:</strong></td>
59
+ <td>Tue Nov 30 14:55:47 -0500 2010</td>
60
+ </tr>
61
+ </table>
62
+ </div>
63
+ <!-- banner header -->
64
+
65
+ <div id="bodyContent">
66
+
67
+
68
+
69
+ <div id="contextContent">
70
+
71
+ <div id="description">
72
+ <h1>ankusa</h1>
73
+ <p>
74
+ <a href="../classes/Ankusa.html">Ankusa</a> is a Naive Bayes classifier in
75
+ Ruby that uses Hadoop&#8217;s HBase for storage. Because it uses HBase as a
76
+ backend, the training corpus can be many terabytes in size.
77
+ </p>
78
+ <h2>Installation</h2>
79
+ <p>
80
+ First, install HBase / Hadoop. Make sure the HBase Thrift interface has
81
+ been started as well. Then:
82
+ </p>
83
+ <pre>
84
+ gem install ankusa
85
+ </pre>
86
+ <h2>Basic Usage</h2>
87
+ <pre>
88
+ require 'rubygems'
89
+ require 'ankusa'
90
+ require 'hbaserb'
91
+
92
+ # connect to HBase
93
+ client = HBaseRb::Client.new 'localhost'
94
+
95
+ c = Classifier.new client
96
+ c.train :spam, &quot;This is some spammy text&quot;
97
+ c.train :good, &quot;This is not the bad stuff&quot;
98
+
99
+ # This will return the most likely class (as symbol)
100
+ puts c.classify &quot;This is some spammy text&quot;
101
+
102
+ # This will return Hash with classes as keys and
103
+ # membership probability as values
104
+ puts c.classifications &quot;This is some spammy text&quot;
105
+
106
+ # get a list of all classes
107
+ puts c.classes
108
+ </pre>
109
+
110
+ </div>
111
+
112
+
113
+ </div>
114
+
115
+
116
+ </div>
117
+
118
+
119
+ <!-- if includes -->
120
+
121
+ <div id="section">
122
+
123
+
124
+
125
+
126
+
127
+
128
+
129
+
130
+ <!-- if method_list -->
131
+
132
+
133
+ </div>
134
+
135
+
136
+ <div id="validator-badges">
137
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
138
+ </div>
139
+
140
+ </body>
141
+ </html>