ankusa 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,231 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Class: Ankusa::NaiveBayesClassifier</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Class</strong></td>
53
+ <td class="class-name-in-header">Ankusa::NaiveBayesClassifier</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../../files/lib/ankusa/naive_bayes_rb.html">
59
+ lib/ankusa/naive_bayes.rb
60
+ </a>
61
+ <br />
62
+ </td>
63
+ </tr>
64
+
65
+ <tr class="top-aligned-row">
66
+ <td><strong>Parent:</strong></td>
67
+ <td>
68
+ Object
69
+ </td>
70
+ </tr>
71
+ </table>
72
+ </div>
73
+ <!-- banner header -->
74
+
75
+ <div id="bodyContent">
76
+
77
+
78
+
79
+ <div id="contextContent">
80
+
81
+
82
+
83
+ </div>
84
+
85
+ <div id="method-list">
86
+ <h3 class="section-bar">Methods</h3>
87
+
88
+ <div class="name-list">
89
+ <a href="#M000025">classifications</a>&nbsp;&nbsp;
90
+ <a href="#M000024">classify</a>&nbsp;&nbsp;
91
+ <a href="#M000026">log_likelihoods</a>&nbsp;&nbsp;
92
+ </div>
93
+ </div>
94
+
95
+ </div>
96
+
97
+
98
+ <!-- if includes -->
99
+ <div id="includes">
100
+ <h3 class="section-bar">Included Modules</h3>
101
+
102
+ <div id="includes-list">
103
+ <span class="include-name"><a href="Classifier.html">Classifier</a></span>
104
+ </div>
105
+ </div>
106
+
107
+ <div id="section">
108
+
109
+
110
+
111
+
112
+
113
+
114
+
115
+
116
+ <!-- if method_list -->
117
+ <div id="methods">
118
+ <h3 class="section-bar">Public Instance methods</h3>
119
+
120
+ <div id="method-M000025" class="method-detail">
121
+ <a name="M000025"></a>
122
+
123
+ <div class="method-heading">
124
+ <a href="#M000025" class="method-signature">
125
+ <span class="method-name">classifications</span><span class="method-args">(text, classnames=nil)</span>
126
+ </a>
127
+ </div>
128
+
129
+ <div class="method-description">
130
+ <p>
131
+ Classes is an array of classes to look at
132
+ </p>
133
+ <p><a class="source-toggle" href="#"
134
+ onclick="toggleCode('M000025-source');return false;">[Source]</a></p>
135
+ <div class="method-source-code" id="M000025-source">
136
+ <pre>
137
+ <span class="ruby-comment cmt"># File lib/ankusa/naive_bayes.rb, line 12</span>
138
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword kw">nil</span>)
139
+ <span class="ruby-identifier">result</span> = <span class="ruby-identifier">log_likelihoods</span> <span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>
140
+ <span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
141
+ <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>]
142
+ }
143
+
144
+ <span class="ruby-comment cmt"># normalize to get probs</span>
145
+ <span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
146
+ <span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span> }
147
+ <span class="ruby-identifier">result</span>
148
+ <span class="ruby-keyword kw">end</span>
149
+ </pre>
150
+ </div>
151
+ </div>
152
+ </div>
153
+
154
+ <div id="method-M000024" class="method-detail">
155
+ <a name="M000024"></a>
156
+
157
+ <div class="method-heading">
158
+ <a href="#M000024" class="method-signature">
159
+ <span class="method-name">classify</span><span class="method-args">(text, classes=nil)</span>
160
+ </a>
161
+ </div>
162
+
163
+ <div class="method-description">
164
+ <p><a class="source-toggle" href="#"
165
+ onclick="toggleCode('M000024-source');return false;">[Source]</a></p>
166
+ <div class="method-source-code" id="M000024-source">
167
+ <pre>
168
+ <span class="ruby-comment cmt"># File lib/ankusa/naive_bayes.rb, line 6</span>
169
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>=<span class="ruby-keyword kw">nil</span>)
170
+ <span class="ruby-comment cmt"># return the most probable class</span>
171
+ <span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>).<span class="ruby-identifier">sort_by</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">c</span><span class="ruby-operator">|</span> <span class="ruby-operator">-</span><span class="ruby-identifier">c</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
172
+ <span class="ruby-keyword kw">end</span>
173
+ </pre>
174
+ </div>
175
+ </div>
176
+ </div>
177
+
178
+ <div id="method-M000026" class="method-detail">
179
+ <a name="M000026"></a>
180
+
181
+ <div class="method-heading">
182
+ <a href="#M000026" class="method-signature">
183
+ <span class="method-name">log_likelihoods</span><span class="method-args">(text, classnames=nil)</span>
184
+ </a>
185
+ </div>
186
+
187
+ <div class="method-description">
188
+ <p>
189
+ Classes is an array of classes to look at
190
+ </p>
191
+ <p><a class="source-toggle" href="#"
192
+ onclick="toggleCode('M000026-source');return false;">[Source]</a></p>
193
+ <div class="method-source-code" id="M000026-source">
194
+ <pre>
195
+ <span class="ruby-comment cmt"># File lib/ankusa/naive_bayes.rb, line 25</span>
196
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword kw">nil</span>)
197
+ <span class="ruby-identifier">classnames</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@classnames</span>
198
+ <span class="ruby-identifier">result</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
199
+
200
+ <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
201
+ <span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classnames</span>)
202
+ <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) <span class="ruby-operator">*</span> <span class="ruby-identifier">count</span>) }
203
+ }
204
+
205
+ <span class="ruby-comment cmt"># add the prior and exponentiate</span>
206
+ <span class="ruby-identifier">doc_counts</span> = <span class="ruby-identifier">doc_count_totals</span>.<span class="ruby-identifier">select</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">k</span> }.<span class="ruby-identifier">map</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">v</span> }
207
+ <span class="ruby-identifier">doc_count_total</span> = (<span class="ruby-identifier">doc_counts</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> } <span class="ruby-operator">+</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">length</span>).<span class="ruby-identifier">to_f</span>
208
+ <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
209
+ <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>((<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>) <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">doc_count_total</span>)
210
+ }
211
+
212
+ <span class="ruby-identifier">result</span>
213
+ <span class="ruby-keyword kw">end</span>
214
+ </pre>
215
+ </div>
216
+ </div>
217
+ </div>
218
+
219
+
220
+ </div>
221
+
222
+
223
+ </div>
224
+
225
+
226
+ <div id="validator-badges">
227
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
228
+ </div>
229
+
230
+ </body>
231
+ </html>
@@ -86,11 +86,11 @@
86
86
  <h3 class="section-bar">Methods</h3>
87
87
 
88
88
  <div class="name-list">
89
- <a href="#M000044">add_text</a>&nbsp;&nbsp;
90
- <a href="#M000045">add_word</a>&nbsp;&nbsp;
91
- <a href="#M000046">atomize</a>&nbsp;&nbsp;
92
- <a href="#M000043">new</a>&nbsp;&nbsp;
93
- <a href="#M000047">valid_word?</a>&nbsp;&nbsp;
89
+ <a href="#M000061">add_text</a>&nbsp;&nbsp;
90
+ <a href="#M000062">add_word</a>&nbsp;&nbsp;
91
+ <a href="#M000063">atomize</a>&nbsp;&nbsp;
92
+ <a href="#M000060">new</a>&nbsp;&nbsp;
93
+ <a href="#M000064">valid_word?</a>&nbsp;&nbsp;
94
94
  </div>
95
95
  </div>
96
96
 
@@ -125,19 +125,19 @@
125
125
  <div id="methods">
126
126
  <h3 class="section-bar">Public Class methods</h3>
127
127
 
128
- <div id="method-M000046" class="method-detail">
129
- <a name="M000046"></a>
128
+ <div id="method-M000063" class="method-detail">
129
+ <a name="M000063"></a>
130
130
 
131
131
  <div class="method-heading">
132
- <a href="#M000046" class="method-signature">
132
+ <a href="#M000063" class="method-signature">
133
133
  <span class="method-name">atomize</span><span class="method-args">(text)</span>
134
134
  </a>
135
135
  </div>
136
136
 
137
137
  <div class="method-description">
138
138
  <p><a class="source-toggle" href="#"
139
- onclick="toggleCode('M000046-source');return false;">[Source]</a></p>
140
- <div class="method-source-code" id="M000046-source">
139
+ onclick="toggleCode('M000063-source');return false;">[Source]</a></p>
140
+ <div class="method-source-code" id="M000063-source">
141
141
  <pre>
142
142
  <span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 33</span>
143
143
  <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">atomize</span>(<span class="ruby-identifier">text</span>)
@@ -148,19 +148,19 @@
148
148
  </div>
149
149
  </div>
150
150
 
151
- <div id="method-M000043" class="method-detail">
152
- <a name="M000043"></a>
151
+ <div id="method-M000060" class="method-detail">
152
+ <a name="M000060"></a>
153
153
 
154
154
  <div class="method-heading">
155
- <a href="#M000043" class="method-signature">
155
+ <a href="#M000060" class="method-signature">
156
156
  <span class="method-name">new</span><span class="method-args">(text=nil)</span>
157
157
  </a>
158
158
  </div>
159
159
 
160
160
  <div class="method-description">
161
161
  <p><a class="source-toggle" href="#"
162
- onclick="toggleCode('M000043-source');return false;">[Source]</a></p>
163
- <div class="method-source-code" id="M000043-source">
162
+ onclick="toggleCode('M000060-source');return false;">[Source]</a></p>
163
+ <div class="method-source-code" id="M000060-source">
164
164
  <pre>
165
165
  <span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 9</span>
166
166
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">text</span>=<span class="ruby-keyword kw">nil</span>)
@@ -173,11 +173,11 @@
173
173
  </div>
174
174
  </div>
175
175
 
176
- <div id="method-M000047" class="method-detail">
177
- <a name="M000047"></a>
176
+ <div id="method-M000064" class="method-detail">
177
+ <a name="M000064"></a>
178
178
 
179
179
  <div class="method-heading">
180
- <a href="#M000047" class="method-signature">
180
+ <a href="#M000064" class="method-signature">
181
181
  <span class="method-name">valid_word?</span><span class="method-args">(word)</span>
182
182
  </a>
183
183
  </div>
@@ -187,8 +187,8 @@
187
187
  word should be only alphanum chars at this point
188
188
  </p>
189
189
  <p><a class="source-toggle" href="#"
190
- onclick="toggleCode('M000047-source');return false;">[Source]</a></p>
191
- <div class="method-source-code" id="M000047-source">
190
+ onclick="toggleCode('M000064-source');return false;">[Source]</a></p>
191
+ <div class="method-source-code" id="M000064-source">
192
192
  <pre>
193
193
  <span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 38</span>
194
194
  <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">valid_word?</span>(<span class="ruby-identifier">word</span>)
@@ -204,19 +204,19 @@ word should be only alphanum chars at this point
204
204
 
205
205
  <h3 class="section-bar">Public Instance methods</h3>
206
206
 
207
- <div id="method-M000044" class="method-detail">
208
- <a name="M000044"></a>
207
+ <div id="method-M000061" class="method-detail">
208
+ <a name="M000061"></a>
209
209
 
210
210
  <div class="method-heading">
211
- <a href="#M000044" class="method-signature">
211
+ <a href="#M000061" class="method-signature">
212
212
  <span class="method-name">add_text</span><span class="method-args">(text)</span>
213
213
  </a>
214
214
  </div>
215
215
 
216
216
  <div class="method-description">
217
217
  <p><a class="source-toggle" href="#"
218
- onclick="toggleCode('M000044-source');return false;">[Source]</a></p>
219
- <div class="method-source-code" id="M000044-source">
218
+ onclick="toggleCode('M000061-source');return false;">[Source]</a></p>
219
+ <div class="method-source-code" id="M000061-source">
220
220
  <pre>
221
221
  <span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 15</span>
222
222
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_text</span>(<span class="ruby-identifier">text</span>)
@@ -235,19 +235,19 @@ word should be only alphanum chars at this point
235
235
  </div>
236
236
  </div>
237
237
 
238
- <div id="method-M000045" class="method-detail">
239
- <a name="M000045"></a>
238
+ <div id="method-M000062" class="method-detail">
239
+ <a name="M000062"></a>
240
240
 
241
241
  <div class="method-heading">
242
- <a href="#M000045" class="method-signature">
242
+ <a href="#M000062" class="method-signature">
243
243
  <span class="method-name">add_word</span><span class="method-args">(word)</span>
244
244
  </a>
245
245
  </div>
246
246
 
247
247
  <div class="method-description">
248
248
  <p><a class="source-toggle" href="#"
249
- onclick="toggleCode('M000045-source');return false;">[Source]</a></p>
250
- <div class="method-source-code" id="M000045-source">
249
+ onclick="toggleCode('M000062-source');return false;">[Source]</a></p>
250
+ <div class="method-source-code" id="M000062-source">
251
251
  <pre>
252
252
  <span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 27</span>
253
253
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_word</span>(<span class="ruby-identifier">word</span>)
@@ -1 +1 @@
1
- Mon, 06 Dec 2010 15:40:49 -0500
1
+ Sun, 12 Dec 2010 13:34:32 -0500
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Mon Dec 06 15:30:41 -0500 2010</td>
59
+ <td>Sun Dec 12 13:30:40 -0500 2010</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -72,31 +72,45 @@
72
72
  <h1>ankusa</h1>
73
73
  <p>
74
74
  <a href="../classes/Ankusa.html">Ankusa</a> is a text classifier in Ruby
75
- that uses Hadoop&#8216;s HBase for storage. Because it uses HBase as a
76
- backend, the training corpus can be many terabytes in size.
75
+ that can use either Hadoop&#8216;s HBase or Cassandra for storage. Because
76
+ it uses HBase or Cassandra as a backend, the training corpus can be many
77
+ terabytes in size.
77
78
  </p>
78
79
  <p>
79
- <a href="../classes/Ankusa.html">Ankusa</a> currently uses a Naive Bayes
80
- classifier. It ignores common words (a.k.a, stop words) and stems all
81
- others. Additionally, it uses Laplacian smoothing in the classification
82
- method.
80
+ <a href="../classes/Ankusa.html">Ankusa</a> currently provides both a Naive
81
+ Bayes and Kullback-Leibler divergence classifier. It ignores common words
82
+ (a.k.a, stop words) and stems all others. Additionally, it uses Laplacian
83
+ smoothing in both classification methods.
83
84
  </p>
84
85
  <h2>Installation</h2>
85
86
  <p>
86
- First, install HBase / Hadoop. Make sure the HBase Thrift interface has
87
- been started as well. Then:
87
+ First, install HBase/Hadoop or Cassandra (&gt;= 0.7.0-rc2). Then, install
88
+ the appropriate gem:
89
+ </p>
90
+ <pre>
91
+ gem install hbaserb
92
+ # or
93
+ gem install cassandra
94
+ </pre>
95
+ <p>
96
+ If you&#8216;re using HBase, make sure the HBase Thrift interface has been
97
+ started as well. Then:
88
98
  </p>
89
99
  <pre>
90
100
  gem install ankusa
91
101
  </pre>
92
102
  <h2>Basic Usage</h2>
103
+ <p>
104
+ Using the naive Bayes classifier:
105
+ </p>
93
106
  <pre>
94
107
  require 'rubygems'
95
108
  require 'ankusa'
109
+ require 'ankusa/hbase_storage'
96
110
 
97
111
  # connect to HBase
98
112
  storage = Ankusa::HBaseStorage.new 'localhost'
99
- c = Ankusa::Classifier.new storage
113
+ c = Ankusa::NaiveBayesClassifier.new storage
100
114
 
101
115
  # Each of these calls will return a bag-of-words
102
116
  # has with stemmed words as keys and counts as values
@@ -116,11 +130,118 @@ been started as well. Then:
116
130
  puts c.log_likelihoods &quot;This is some spammy text&quot;
117
131
 
118
132
  # get a list of all classes
119
- puts c.classes
133
+ puts c.classnames
120
134
 
121
135
  # close connection
122
136
  storage.close
123
137
  </pre>
138
+ <h2>KL Divergence Classifier</h2>
139
+ <p>
140
+ There is a Kullback–Leibler divergence classifier as well. KL divergence
141
+ is a distance measure (though not a true metric because it does not satisfy
142
+ the triangle inequality). The KL classifier simply measures the relative
143
+ entropy between the text you want to classify and each of the classes. The
144
+ class with the shortest &quot;distance&quot; is the best class. You may
145
+ find that for an especially large corpus it may be slightly faster to use
146
+ this classifier (since prior probabilities are never calculated, only
147
+ likelihoods).
148
+ </p>
149
+ <p>
150
+ The API is the same as the NaiveBayesClassifier, except rather than calling
151
+ &quot;classifications&quot; if you want actual numbers you call
152
+ &quot;distances&quot;.
153
+ </p>
154
+ <pre>
155
+ require 'rubygems'
156
+ require 'ankusa'
157
+ require 'ankusa/hbase_storage'
158
+
159
+ # connect to HBase
160
+ storage = Ankusa::HBaseStorage.new 'localhost'
161
+ c = Ankusa::KLDivergenceClassifier.new storage
162
+
163
+ # Each of these calls will return a bag-of-words
164
+ # hash with stemmed words as keys and counts as values
165
+ c.train :spam, &quot;This is some spammy text&quot;
166
+ c.train :good, &quot;This is not the bad stuff&quot;
167
+
168
+ # This will return the most likely class (as symbol)
169
+ puts c.classify &quot;This is some spammy text&quot;
170
+
171
+ # This will return Hash with classes as keys and
172
+ # distances &gt;= 0 as values
173
+ puts c.distances &quot;This is some spammy text&quot;
174
+
175
+ # get a list of all classes
176
+ puts c.classnames
177
+
178
+ # close connection
179
+ storage.close
180
+ </pre>
181
+ <h2>Storage Methods</h2>
182
+ <p>
183
+ <a href="../classes/Ankusa.html">Ankusa</a> has a generalized storage
184
+ interface that has been implemented for HBase, Cassandra, and in-memory
185
+ storage.
186
+ </p>
187
+ <p>
188
+ Memory storage can be used when you have a very small corpus:
189
+ </p>
190
+ <pre>
191
+ require 'ankusa/memory_storage'
192
+ storage = Ankusa::MemoryStorage.new
193
+ </pre>
194
+ <p>
195
+ HBase storage:
196
+ </p>
197
+ <pre>
198
+ require 'ankusa/hbase_storage'
199
+ # defaults: host='localhost', port=9090, frequency_tablename=&quot;ankusa_word_frequencies&quot;, summary_tablename=&quot;ankusa_summary&quot;
200
+ storage = Ankusa::HBaseStorage.new host, port, frequency_tablename, summary_tablename
201
+ </pre>
202
+ <p>
203
+ For Cassandra storage:
204
+ </p>
205
+ <ul>
206
+ <li>You will need Cassandra version 0.7.0-rc2 or greater.
207
+
208
+ </li>
209
+ <li>You will need to set a max number of classes since the current implementation of
210
+ the Ruby Cassandra client doesn&#8216;t support table scans.
211
+
212
+ </li>
213
+ <li>Prior to using the Cassandra storage you will need to run the following
214
+ command from the cassandra-cli: &quot;create keyspace ankusa with
215
+ replication_factor = 1&quot;. This should be fixed with a new release
216
+ candidate for Cassandra.
217
+
218
+ </li>
219
+ </ul>
220
+ <p>
221
+ To use the Cassandra storage class:
222
+ </p>
223
+ <pre>
224
+ require 'ankusa/cassandra_storage'
225
+ # defaults: host='127.0.0.1', port=9160, keyspace = 'ankusa', max_classes = 100
226
+ storage = Ankusa::CassandraStorage.new host, port, keyspace, max_classes
227
+ </pre>
228
+ <h2>Running Tests</h2>
229
+ <p>
230
+ You can run the tests for any of the three storage methods. For instance,
231
+ for memory storage:
232
+ </p>
233
+ <pre>
234
+ rake test_memory
235
+ </pre>
236
+ <p>
237
+ For the other methods you will need to edit the file test/config.yml and
238
+ set the configuration params. Then:
239
+ </p>
240
+ <pre>
241
+ rake test_hbase
242
+ # or
243
+ rake test_cassandra
244
+ </pre>
124
245
 
125
246
  </div>
126
247