ankusa 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,168 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Class: Ankusa::NBClass</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Class</strong></td>
53
+ <td class="class-name-in-header">Ankusa::NBClass</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../../files/lib/ankusa/nbclass_rb.html">
59
+ lib/ankusa/nbclass.rb
60
+ </a>
61
+ <br />
62
+ </td>
63
+ </tr>
64
+
65
+ <tr class="top-aligned-row">
66
+ <td><strong>Parent:</strong></td>
67
+ <td>
68
+ Object
69
+ </td>
70
+ </tr>
71
+ </table>
72
+ </div>
73
+ <!-- banner header -->
74
+
75
+ <div id="bodyContent">
76
+
77
+
78
+
79
+ <div id="contextContent">
80
+
81
+
82
+
83
+ </div>
84
+
85
+ <div id="method-list">
86
+ <h3 class="section-bar">Methods</h3>
87
+
88
+ <div class="name-list">
89
+ <a href="#M000014">new</a>&nbsp;&nbsp;
90
+ </div>
91
+ </div>
92
+
93
+ </div>
94
+
95
+
96
+ <!-- if includes -->
97
+
98
+ <div id="section">
99
+
100
+
101
+
102
+
103
+
104
+ <div id="attribute-list">
105
+ <h3 class="section-bar">Attributes</h3>
106
+
107
+ <div class="name-list">
108
+ <table>
109
+ <tr class="top-aligned-row context-row">
110
+ <td class="context-item-name">doc_count</td>
111
+ <td class="context-item-value">&nbsp;[R]&nbsp;</td>
112
+ <td class="context-item-desc"></td>
113
+ </tr>
114
+ <tr class="top-aligned-row context-row">
115
+ <td class="context-item-name">word_count</td>
116
+ <td class="context-item-value">&nbsp;[R]&nbsp;</td>
117
+ <td class="context-item-desc"></td>
118
+ </tr>
119
+ </table>
120
+ </div>
121
+ </div>
122
+
123
+
124
+
125
+ <!-- if method_list -->
126
+ <div id="methods">
127
+ <h3 class="section-bar">Public Class methods</h3>
128
+
129
+ <div id="method-M000014" class="method-detail">
130
+ <a name="M000014"></a>
131
+
132
+ <div class="method-heading">
133
+ <a href="#M000014" class="method-signature">
134
+ <span class="method-name">new</span><span class="method-args">(name, summary_table, freq_table)</span>
135
+ </a>
136
+ </div>
137
+
138
+ <div class="method-description">
139
+ <p><a class="source-toggle" href="#"
140
+ onclick="toggleCode('M000014-source');return false;">[Source]</a></p>
141
+ <div class="method-source-code" id="M000014-source">
142
+ <pre>
143
+ <span class="ruby-comment cmt"># File lib/ankusa/nbclass.rb, line 6</span>
144
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">name</span>, <span class="ruby-identifier">summary_table</span>, <span class="ruby-identifier">freq_table</span>)
145
+ <span class="ruby-ivar">@name</span> = <span class="ruby-identifier">name</span>
146
+ <span class="ruby-ivar">@summary_table</span> = <span class="ruby-identifier">summary_table</span>
147
+ <span class="ruby-ivar">@freq_table</span> = <span class="ruby-identifier">freq_table</span>
148
+ <span class="ruby-ivar">@word_count</span> = <span class="ruby-ivar">@summary_table</span>.<span class="ruby-identifier">get</span>(<span class="ruby-ivar">@name</span>, <span class="ruby-value str">&quot;totals:wordcount&quot;</span>).<span class="ruby-identifier">first</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span>
149
+ <span class="ruby-ivar">@doc_count</span> = <span class="ruby-ivar">@summary_table</span>.<span class="ruby-identifier">get</span>(<span class="ruby-ivar">@name</span>, <span class="ruby-value str">&quot;totals:doccount&quot;</span>).<span class="ruby-identifier">first</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span>
150
+ <span class="ruby-keyword kw">end</span>
151
+ </pre>
152
+ </div>
153
+ </div>
154
+ </div>
155
+
156
+
157
+ </div>
158
+
159
+
160
+ </div>
161
+
162
+
163
+ <div id="validator-badges">
164
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
165
+ </div>
166
+
167
+ </body>
168
+ </html>
@@ -0,0 +1,220 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Class: Ankusa::TextHash</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Class</strong></td>
53
+ <td class="class-name-in-header">Ankusa::TextHash</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../../files/lib/ankusa/hasher_rb.html">
59
+ lib/ankusa/hasher.rb
60
+ </a>
61
+ <br />
62
+ </td>
63
+ </tr>
64
+
65
+ <tr class="top-aligned-row">
66
+ <td><strong>Parent:</strong></td>
67
+ <td>
68
+ Hash
69
+ </td>
70
+ </tr>
71
+ </table>
72
+ </div>
73
+ <!-- banner header -->
74
+
75
+ <div id="bodyContent">
76
+
77
+
78
+
79
+ <div id="contextContent">
80
+
81
+
82
+
83
+ </div>
84
+
85
+ <div id="method-list">
86
+ <h3 class="section-bar">Methods</h3>
87
+
88
+ <div class="name-list">
89
+ <a href="#M000016">add_text</a>&nbsp;&nbsp;
90
+ <a href="#M000017">add_word</a>&nbsp;&nbsp;
91
+ <a href="#M000015">new</a>&nbsp;&nbsp;
92
+ </div>
93
+ </div>
94
+
95
+ </div>
96
+
97
+
98
+ <!-- if includes -->
99
+
100
+ <div id="section">
101
+
102
+
103
+
104
+
105
+
106
+ <div id="attribute-list">
107
+ <h3 class="section-bar">Attributes</h3>
108
+
109
+ <div class="name-list">
110
+ <table>
111
+ <tr class="top-aligned-row context-row">
112
+ <td class="context-item-name">word_count</td>
113
+ <td class="context-item-value">&nbsp;[R]&nbsp;</td>
114
+ <td class="context-item-desc"></td>
115
+ </tr>
116
+ </table>
117
+ </div>
118
+ </div>
119
+
120
+
121
+
122
+ <!-- if method_list -->
123
+ <div id="methods">
124
+ <h3 class="section-bar">Public Class methods</h3>
125
+
126
+ <div id="method-M000015" class="method-detail">
127
+ <a name="M000015"></a>
128
+
129
+ <div class="method-heading">
130
+ <a href="#M000015" class="method-signature">
131
+ <span class="method-name">new</span><span class="method-args">(text=nil)</span>
132
+ </a>
133
+ </div>
134
+
135
+ <div class="method-description">
136
+ <p><a class="source-toggle" href="#"
137
+ onclick="toggleCode('M000015-source');return false;">[Source]</a></p>
138
+ <div class="method-source-code" id="M000015-source">
139
+ <pre>
140
+ <span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 9</span>
141
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">text</span>=<span class="ruby-keyword kw">nil</span>)
142
+ <span class="ruby-keyword kw">super</span> <span class="ruby-value">0</span>
143
+ <span class="ruby-ivar">@word_count</span> = <span class="ruby-value">0</span>
144
+ <span class="ruby-identifier">add_text</span>(<span class="ruby-identifier">text</span>) <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">nil?</span>
145
+ <span class="ruby-keyword kw">end</span>
146
+ </pre>
147
+ </div>
148
+ </div>
149
+ </div>
150
+
151
+ <h3 class="section-bar">Public Instance methods</h3>
152
+
153
+ <div id="method-M000016" class="method-detail">
154
+ <a name="M000016"></a>
155
+
156
+ <div class="method-heading">
157
+ <a href="#M000016" class="method-signature">
158
+ <span class="method-name">add_text</span><span class="method-args">(text)</span>
159
+ </a>
160
+ </div>
161
+
162
+ <div class="method-description">
163
+ <p><a class="source-toggle" href="#"
164
+ onclick="toggleCode('M000016-source');return false;">[Source]</a></p>
165
+ <div class="method-source-code" id="M000016-source">
166
+ <pre>
167
+ <span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 15</span>
168
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_text</span>(<span class="ruby-identifier">text</span>)
169
+ <span class="ruby-comment cmt"># replace dashes with spaces, then get rid of non-word/non-space characters, </span>
170
+ <span class="ruby-comment cmt"># then split by space to get words</span>
171
+ <span class="ruby-identifier">words</span> = <span class="ruby-identifier">text</span>.<span class="ruby-identifier">tr</span>(<span class="ruby-value str">'-'</span>, <span class="ruby-value str">' '</span>).<span class="ruby-identifier">gsub</span>(<span class="ruby-regexp re">/[^\w\s]/</span>,<span class="ruby-value str">&quot;&quot;</span>).<span class="ruby-identifier">split</span>
172
+ <span class="ruby-identifier">words</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span><span class="ruby-operator">|</span> <span class="ruby-identifier">add_word</span> <span class="ruby-identifier">word</span> }
173
+ <span class="ruby-keyword kw">self</span>
174
+ <span class="ruby-keyword kw">end</span>
175
+ </pre>
176
+ </div>
177
+ </div>
178
+ </div>
179
+
180
+ <div id="method-M000017" class="method-detail">
181
+ <a name="M000017"></a>
182
+
183
+ <div class="method-heading">
184
+ <a href="#M000017" class="method-signature">
185
+ <span class="method-name">add_word</span><span class="method-args">(word)</span>
186
+ </a>
187
+ </div>
188
+
189
+ <div class="method-description">
190
+ <p><a class="source-toggle" href="#"
191
+ onclick="toggleCode('M000017-source');return false;">[Source]</a></p>
192
+ <div class="method-source-code" id="M000017-source">
193
+ <pre>
194
+ <span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 23</span>
195
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_word</span>(<span class="ruby-identifier">word</span>)
196
+ <span class="ruby-identifier">word</span> = <span class="ruby-identifier">word</span>.<span class="ruby-identifier">downcase</span>
197
+ <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-constant">Ankusa</span><span class="ruby-operator">::</span><span class="ruby-constant">STOPWORDS</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">word</span>
198
+ <span class="ruby-ivar">@word_count</span> <span class="ruby-operator">+=</span> <span class="ruby-value">1</span>
199
+ <span class="ruby-identifier">key</span> = <span class="ruby-identifier">word</span>.<span class="ruby-identifier">stem</span>.<span class="ruby-identifier">intern</span>
200
+ <span class="ruby-identifier">store</span> <span class="ruby-identifier">key</span>, <span class="ruby-identifier">fetch</span>(<span class="ruby-identifier">key</span>, <span class="ruby-value">0</span>)<span class="ruby-operator">+</span><span class="ruby-value">1</span>
201
+ <span class="ruby-keyword kw">end</span>
202
+ <span class="ruby-keyword kw">end</span>
203
+ </pre>
204
+ </div>
205
+ </div>
206
+ </div>
207
+
208
+
209
+ </div>
210
+
211
+
212
+ </div>
213
+
214
+
215
+ <div id="validator-badges">
216
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
217
+ </div>
218
+
219
+ </body>
220
+ </html>
data/docs/created.rid ADDED
@@ -0,0 +1 @@
1
+ Tue, 30 Nov 2010 16:28:42 -0500
@@ -0,0 +1,141 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>File: README.rdoc</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="fileHeader">
50
+ <h1>README.rdoc</h1>
51
+ <table class="header-table">
52
+ <tr class="top-aligned-row">
53
+ <td><strong>Path:</strong></td>
54
+ <td>README.rdoc
55
+ </td>
56
+ </tr>
57
+ <tr class="top-aligned-row">
58
+ <td><strong>Last Update:</strong></td>
59
+ <td>Tue Nov 30 14:55:47 -0500 2010</td>
60
+ </tr>
61
+ </table>
62
+ </div>
63
+ <!-- banner header -->
64
+
65
+ <div id="bodyContent">
66
+
67
+
68
+
69
+ <div id="contextContent">
70
+
71
+ <div id="description">
72
+ <h1>ankusa</h1>
73
+ <p>
74
+ <a href="../classes/Ankusa.html">Ankusa</a> is a Naive Bayes classifier in
75
+ Ruby that uses Hadoop&#8217;s HBase for storage. Because it uses HBase as a
76
+ backend, the training corpus can be many terabytes in size.
77
+ </p>
78
+ <h2>Installation</h2>
79
+ <p>
80
+ First, install HBase / Hadoop. Make sure the HBase Thrift interface has
81
+ been started as well. Then:
82
+ </p>
83
+ <pre>
84
+ gem install ankusa
85
+ </pre>
86
+ <h2>Basic Usage</h2>
87
+ <pre>
88
+ require 'rubygems'
89
+ require 'ankusa'
90
+ require 'hbaserb'
91
+
92
+ # connect to HBase
93
+ client = HBaseRb::Client.new 'localhost'
94
+
95
+ c = Classifier.new client
96
+ c.train :spam, &quot;This is some spammy text&quot;
97
+ c.train :good, &quot;This is not the bad stuff&quot;
98
+
99
+ # This will return the most likely class (as symbol)
100
+ puts c.classify &quot;This is some spammy text&quot;
101
+
102
+ # This will return a Hash with classes as keys and
103
+ # membership probability as values
104
+ puts c.classifications &quot;This is some spammy text&quot;
105
+
106
+ # get a list of all classes
107
+ puts c.classes
108
+ </pre>
109
+
110
+ </div>
111
+
112
+
113
+ </div>
114
+
115
+
116
+ </div>
117
+
118
+
119
+ <!-- if includes -->
120
+
121
+ <div id="section">
122
+
123
+
124
+
125
+
126
+
127
+
128
+
129
+
130
+ <!-- if method_list -->
131
+
132
+
133
+ </div>
134
+
135
+
136
+ <div id="validator-badges">
137
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
138
+ </div>
139
+
140
+ </body>
141
+ </html>