ankusa 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,231 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Class: Ankusa::NaiveBayesClassifier</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Class</strong></td>
53
+ <td class="class-name-in-header">Ankusa::NaiveBayesClassifier</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../../files/lib/ankusa/naive_bayes_rb.html">
59
+ lib/ankusa/naive_bayes.rb
60
+ </a>
61
+ <br />
62
+ </td>
63
+ </tr>
64
+
65
+ <tr class="top-aligned-row">
66
+ <td><strong>Parent:</strong></td>
67
+ <td>
68
+ Object
69
+ </td>
70
+ </tr>
71
+ </table>
72
+ </div>
73
+ <!-- banner header -->
74
+
75
+ <div id="bodyContent">
76
+
77
+
78
+
79
+ <div id="contextContent">
80
+
81
+
82
+
83
+ </div>
84
+
85
+ <div id="method-list">
86
+ <h3 class="section-bar">Methods</h3>
87
+
88
+ <div class="name-list">
89
+ <a href="#M000025">classifications</a>&nbsp;&nbsp;
90
+ <a href="#M000024">classify</a>&nbsp;&nbsp;
91
+ <a href="#M000026">log_likelihoods</a>&nbsp;&nbsp;
92
+ </div>
93
+ </div>
94
+
95
+ </div>
96
+
97
+
98
+ <!-- if includes -->
99
+ <div id="includes">
100
+ <h3 class="section-bar">Included Modules</h3>
101
+
102
+ <div id="includes-list">
103
+ <span class="include-name"><a href="Classifier.html">Classifier</a></span>
104
+ </div>
105
+ </div>
106
+
107
+ <div id="section">
108
+
109
+
110
+
111
+
112
+
113
+
114
+
115
+
116
+ <!-- if method_list -->
117
+ <div id="methods">
118
+ <h3 class="section-bar">Public Instance methods</h3>
119
+
120
+ <div id="method-M000025" class="method-detail">
121
+ <a name="M000025"></a>
122
+
123
+ <div class="method-heading">
124
+ <a href="#M000025" class="method-signature">
125
+ <span class="method-name">classifications</span><span class="method-args">(text, classnames=nil)</span>
126
+ </a>
127
+ </div>
128
+
129
+ <div class="method-description">
130
+ <p>
131
+ Classes is an array of classes to look at
132
+ </p>
133
+ <p><a class="source-toggle" href="#"
134
+ onclick="toggleCode('M000025-source');return false;">[Source]</a></p>
135
+ <div class="method-source-code" id="M000025-source">
136
+ <pre>
137
+ <span class="ruby-comment cmt"># File lib/ankusa/naive_bayes.rb, line 12</span>
138
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword kw">nil</span>)
139
+ <span class="ruby-identifier">result</span> = <span class="ruby-identifier">log_likelihoods</span> <span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>
140
+ <span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
141
+ <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>]
142
+ }
143
+
144
+ <span class="ruby-comment cmt"># normalize to get probs</span>
145
+ <span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
146
+ <span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span> }
147
+ <span class="ruby-identifier">result</span>
148
+ <span class="ruby-keyword kw">end</span>
149
+ </pre>
150
+ </div>
151
+ </div>
152
+ </div>
153
+
154
+ <div id="method-M000024" class="method-detail">
155
+ <a name="M000024"></a>
156
+
157
+ <div class="method-heading">
158
+ <a href="#M000024" class="method-signature">
159
+ <span class="method-name">classify</span><span class="method-args">(text, classes=nil)</span>
160
+ </a>
161
+ </div>
162
+
163
+ <div class="method-description">
164
+ <p><a class="source-toggle" href="#"
165
+ onclick="toggleCode('M000024-source');return false;">[Source]</a></p>
166
+ <div class="method-source-code" id="M000024-source">
167
+ <pre>
168
+ <span class="ruby-comment cmt"># File lib/ankusa/naive_bayes.rb, line 6</span>
169
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>=<span class="ruby-keyword kw">nil</span>)
170
+ <span class="ruby-comment cmt"># return the most probable class</span>
171
+ <span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>).<span class="ruby-identifier">sort_by</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">c</span><span class="ruby-operator">|</span> <span class="ruby-operator">-</span><span class="ruby-identifier">c</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
172
+ <span class="ruby-keyword kw">end</span>
173
+ </pre>
174
+ </div>
175
+ </div>
176
+ </div>
177
+
178
+ <div id="method-M000026" class="method-detail">
179
+ <a name="M000026"></a>
180
+
181
+ <div class="method-heading">
182
+ <a href="#M000026" class="method-signature">
183
+ <span class="method-name">log_likelihoods</span><span class="method-args">(text, classnames=nil)</span>
184
+ </a>
185
+ </div>
186
+
187
+ <div class="method-description">
188
+ <p>
189
+ Classes is an array of classes to look at
190
+ </p>
191
+ <p><a class="source-toggle" href="#"
192
+ onclick="toggleCode('M000026-source');return false;">[Source]</a></p>
193
+ <div class="method-source-code" id="M000026-source">
194
+ <pre>
195
+ <span class="ruby-comment cmt"># File lib/ankusa/naive_bayes.rb, line 25</span>
196
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword kw">nil</span>)
197
+ <span class="ruby-identifier">classnames</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@classnames</span>
198
+ <span class="ruby-identifier">result</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
199
+
200
+ <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
201
+ <span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classnames</span>)
202
+ <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) <span class="ruby-operator">*</span> <span class="ruby-identifier">count</span>) }
203
+ }
204
+
205
+ <span class="ruby-comment cmt"># add the prior and exponentiate</span>
206
+ <span class="ruby-identifier">doc_counts</span> = <span class="ruby-identifier">doc_count_totals</span>.<span class="ruby-identifier">select</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">k</span> }.<span class="ruby-identifier">map</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">v</span> }
207
+ <span class="ruby-identifier">doc_count_total</span> = (<span class="ruby-identifier">doc_counts</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> } <span class="ruby-operator">+</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">length</span>).<span class="ruby-identifier">to_f</span>
208
+ <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
209
+ <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>((<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>) <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">doc_count_total</span>)
210
+ }
211
+
212
+ <span class="ruby-identifier">result</span>
213
+ <span class="ruby-keyword kw">end</span>
214
+ </pre>
215
+ </div>
216
+ </div>
217
+ </div>
218
+
219
+
220
+ </div>
221
+
222
+
223
+ </div>
224
+
225
+
226
+ <div id="validator-badges">
227
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
228
+ </div>
229
+
230
+ </body>
231
+ </html>
@@ -86,11 +86,11 @@
86
86
  <h3 class="section-bar">Methods</h3>
87
87
 
88
88
  <div class="name-list">
89
- <a href="#M000044">add_text</a>&nbsp;&nbsp;
90
- <a href="#M000045">add_word</a>&nbsp;&nbsp;
91
- <a href="#M000046">atomize</a>&nbsp;&nbsp;
92
- <a href="#M000043">new</a>&nbsp;&nbsp;
93
- <a href="#M000047">valid_word?</a>&nbsp;&nbsp;
89
+ <a href="#M000061">add_text</a>&nbsp;&nbsp;
90
+ <a href="#M000062">add_word</a>&nbsp;&nbsp;
91
+ <a href="#M000063">atomize</a>&nbsp;&nbsp;
92
+ <a href="#M000060">new</a>&nbsp;&nbsp;
93
+ <a href="#M000064">valid_word?</a>&nbsp;&nbsp;
94
94
  </div>
95
95
  </div>
96
96
 
@@ -125,19 +125,19 @@
125
125
  <div id="methods">
126
126
  <h3 class="section-bar">Public Class methods</h3>
127
127
 
128
- <div id="method-M000046" class="method-detail">
129
- <a name="M000046"></a>
128
+ <div id="method-M000063" class="method-detail">
129
+ <a name="M000063"></a>
130
130
 
131
131
  <div class="method-heading">
132
- <a href="#M000046" class="method-signature">
132
+ <a href="#M000063" class="method-signature">
133
133
  <span class="method-name">atomize</span><span class="method-args">(text)</span>
134
134
  </a>
135
135
  </div>
136
136
 
137
137
  <div class="method-description">
138
138
  <p><a class="source-toggle" href="#"
139
- onclick="toggleCode('M000046-source');return false;">[Source]</a></p>
140
- <div class="method-source-code" id="M000046-source">
139
+ onclick="toggleCode('M000063-source');return false;">[Source]</a></p>
140
+ <div class="method-source-code" id="M000063-source">
141
141
  <pre>
142
142
  <span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 33</span>
143
143
  <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">atomize</span>(<span class="ruby-identifier">text</span>)
@@ -148,19 +148,19 @@
148
148
  </div>
149
149
  </div>
150
150
 
151
- <div id="method-M000043" class="method-detail">
152
- <a name="M000043"></a>
151
+ <div id="method-M000060" class="method-detail">
152
+ <a name="M000060"></a>
153
153
 
154
154
  <div class="method-heading">
155
- <a href="#M000043" class="method-signature">
155
+ <a href="#M000060" class="method-signature">
156
156
  <span class="method-name">new</span><span class="method-args">(text=nil)</span>
157
157
  </a>
158
158
  </div>
159
159
 
160
160
  <div class="method-description">
161
161
  <p><a class="source-toggle" href="#"
162
- onclick="toggleCode('M000043-source');return false;">[Source]</a></p>
163
- <div class="method-source-code" id="M000043-source">
162
+ onclick="toggleCode('M000060-source');return false;">[Source]</a></p>
163
+ <div class="method-source-code" id="M000060-source">
164
164
  <pre>
165
165
  <span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 9</span>
166
166
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">text</span>=<span class="ruby-keyword kw">nil</span>)
@@ -173,11 +173,11 @@
173
173
  </div>
174
174
  </div>
175
175
 
176
- <div id="method-M000047" class="method-detail">
177
- <a name="M000047"></a>
176
+ <div id="method-M000064" class="method-detail">
177
+ <a name="M000064"></a>
178
178
 
179
179
  <div class="method-heading">
180
- <a href="#M000047" class="method-signature">
180
+ <a href="#M000064" class="method-signature">
181
181
  <span class="method-name">valid_word?</span><span class="method-args">(word)</span>
182
182
  </a>
183
183
  </div>
@@ -187,8 +187,8 @@
187
187
  word should be only alphanum chars at this point
188
188
  </p>
189
189
  <p><a class="source-toggle" href="#"
190
- onclick="toggleCode('M000047-source');return false;">[Source]</a></p>
191
- <div class="method-source-code" id="M000047-source">
190
+ onclick="toggleCode('M000064-source');return false;">[Source]</a></p>
191
+ <div class="method-source-code" id="M000064-source">
192
192
  <pre>
193
193
  <span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 38</span>
194
194
  <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">valid_word?</span>(<span class="ruby-identifier">word</span>)
@@ -204,19 +204,19 @@ word should be only alphanum chars at this point
204
204
 
205
205
  <h3 class="section-bar">Public Instance methods</h3>
206
206
 
207
- <div id="method-M000044" class="method-detail">
208
- <a name="M000044"></a>
207
+ <div id="method-M000061" class="method-detail">
208
+ <a name="M000061"></a>
209
209
 
210
210
  <div class="method-heading">
211
- <a href="#M000044" class="method-signature">
211
+ <a href="#M000061" class="method-signature">
212
212
  <span class="method-name">add_text</span><span class="method-args">(text)</span>
213
213
  </a>
214
214
  </div>
215
215
 
216
216
  <div class="method-description">
217
217
  <p><a class="source-toggle" href="#"
218
- onclick="toggleCode('M000044-source');return false;">[Source]</a></p>
219
- <div class="method-source-code" id="M000044-source">
218
+ onclick="toggleCode('M000061-source');return false;">[Source]</a></p>
219
+ <div class="method-source-code" id="M000061-source">
220
220
  <pre>
221
221
  <span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 15</span>
222
222
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_text</span>(<span class="ruby-identifier">text</span>)
@@ -235,19 +235,19 @@ word should be only alphanum chars at this point
235
235
  </div>
236
236
  </div>
237
237
 
238
- <div id="method-M000045" class="method-detail">
239
- <a name="M000045"></a>
238
+ <div id="method-M000062" class="method-detail">
239
+ <a name="M000062"></a>
240
240
 
241
241
  <div class="method-heading">
242
- <a href="#M000045" class="method-signature">
242
+ <a href="#M000062" class="method-signature">
243
243
  <span class="method-name">add_word</span><span class="method-args">(word)</span>
244
244
  </a>
245
245
  </div>
246
246
 
247
247
  <div class="method-description">
248
248
  <p><a class="source-toggle" href="#"
249
- onclick="toggleCode('M000045-source');return false;">[Source]</a></p>
250
- <div class="method-source-code" id="M000045-source">
249
+ onclick="toggleCode('M000062-source');return false;">[Source]</a></p>
250
+ <div class="method-source-code" id="M000062-source">
251
251
  <pre>
252
252
  <span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 27</span>
253
253
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_word</span>(<span class="ruby-identifier">word</span>)
@@ -1 +1 @@
1
- Mon, 06 Dec 2010 15:40:49 -0500
1
+ Sun, 12 Dec 2010 13:34:32 -0500
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Mon Dec 06 15:30:41 -0500 2010</td>
59
+ <td>Sun Dec 12 13:30:40 -0500 2010</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -72,31 +72,45 @@
72
72
  <h1>ankusa</h1>
73
73
  <p>
74
74
  <a href="../classes/Ankusa.html">Ankusa</a> is a text classifier in Ruby
75
- that uses Hadoop&#8216;s HBase for storage. Because it uses HBase as a
76
- backend, the training corpus can be many terabytes in size.
75
+ that can use either Hadoop&#8216;s HBase or Cassandra for storage. Because
76
+ it uses HBase or Cassandra as a backend, the training corpus can be many
77
+ terabytes in size.
77
78
  </p>
78
79
  <p>
79
- <a href="../classes/Ankusa.html">Ankusa</a> currently uses a Naive Bayes
80
- classifier. It ignores common words (a.k.a, stop words) and stems all
81
- others. Additionally, it uses Laplacian smoothing in the classification
82
- method.
80
+ <a href="../classes/Ankusa.html">Ankusa</a> currently provides both a Naive
81
+ Bayes and Kullback-Leibler divergence classifier. It ignores common words
82
+ (a.k.a. stop words) and stems all others. Additionally, it uses Laplacian
83
+ smoothing in both classification methods.
83
84
  </p>
84
85
  <h2>Installation</h2>
85
86
  <p>
86
- First, install HBase / Hadoop. Make sure the HBase Thrift interface has
87
- been started as well. Then:
87
+ First, install HBase/Hadoop or Cassandra (&gt;= 0.7.0-rc2). Then, install
88
+ the appropriate gem:
89
+ </p>
90
+ <pre>
91
+ gem install hbaserb
92
+ # or
93
+ gem install cassandra
94
+ </pre>
95
+ <p>
96
+ If you&#8216;re using HBase, make sure the HBase Thrift interface has been
97
+ started as well. Then:
88
98
  </p>
89
99
  <pre>
90
100
  gem install ankusa
91
101
  </pre>
92
102
  <h2>Basic Usage</h2>
103
+ <p>
104
+ Using the naive Bayes classifier:
105
+ </p>
93
106
  <pre>
94
107
  require 'rubygems'
95
108
  require 'ankusa'
109
+ require 'ankusa/hbase_storage'
96
110
 
97
111
  # connect to HBase
98
112
  storage = Ankusa::HBaseStorage.new 'localhost'
99
- c = Ankusa::Classifier.new storage
113
+ c = Ankusa::NaiveBayesClassifier.new storage
100
114
 
101
115
  # Each of these calls will return a bag-of-words
102
116
  # has with stemmed words as keys and counts as values
@@ -116,11 +130,118 @@ been started as well. Then:
116
130
  puts c.log_likelihoods &quot;This is some spammy text&quot;
117
131
 
118
132
  # get a list of all classes
119
- puts c.classes
133
+ puts c.classnames
120
134
 
121
135
  # close connection
122
136
  storage.close
123
137
  </pre>
138
+ <h2>KL Divergence Classifier</h2>
139
+ <p>
140
+ There is a Kullback–Leibler divergence classifier as well. KL divergence
141
+ is a distance measure (though not a true metric because it does not satisfy
142
+ the triangle inequality). The KL classifier simply measures the relative
143
+ entropy between the text you want to classify and each of the classes. The
144
+ class with the shortest &quot;distance&quot; is the best class. You may
145
+ find that for an especially large corpus it may be slightly faster to use
146
+ this classifier (since prior probabilities are never calculated, only
147
+ likelihoods).
148
+ </p>
149
+ <p>
150
+ The API is the same as the NaiveBayesClassifier, except rather than calling
151
+ &quot;classifications&quot; if you want actual numbers you call
152
+ &quot;distances&quot;.
153
+ </p>
154
+ <pre>
155
+ require 'rubygems'
156
+ require 'ankusa'
157
+ require 'ankusa/hbase_storage'
158
+
159
+ # connect to HBase
160
+ storage = Ankusa::HBaseStorage.new 'localhost'
161
+ c = Ankusa::KLDivergenceClassifier.new storage
162
+
163
+ # Each of these calls will return a bag-of-words
164
+ # hash with stemmed words as keys and counts as values
165
+ c.train :spam, &quot;This is some spammy text&quot;
166
+ c.train :good, &quot;This is not the bad stuff&quot;
167
+
168
+ # This will return the most likely class (as symbol)
169
+ puts c.classify &quot;This is some spammy text&quot;
170
+
171
+ # This will return Hash with classes as keys and
172
+ # distances &gt;= 0 as values
173
+ puts c.distances &quot;This is some spammy text&quot;
174
+
175
+ # get a list of all classes
176
+ puts c.classnames
177
+
178
+ # close connection
179
+ storage.close
180
+ </pre>
181
+ <h2>Storage Methods</h2>
182
+ <p>
183
+ <a href="../classes/Ankusa.html">Ankusa</a> has a generalized storage
184
+ interface that has been implemented for HBase, Cassandra, and in-memory
185
+ storage.
186
+ </p>
187
+ <p>
188
+ Memory storage can be used when you have a very small corpus
189
+ </p>
190
+ <pre>
191
+ require 'ankusa/memory_storage'
192
+ storage = Ankusa::MemoryStorage.new
193
+ </pre>
194
+ <p>
195
+ HBase storage:
196
+ </p>
197
+ <pre>
198
+ require 'ankusa/hbase_storage'
199
+ # defaults: host='localhost', port=9090, frequency_tablename=&quot;ankusa_word_frequencies&quot;, summary_tablename=&quot;ankusa_summary&quot;
200
+ storage = Ankusa::HBaseStorage.new host, port, frequency_tablename, summary_tablename
201
+ </pre>
202
+ <p>
203
+ For Cassandra storage:
204
+ </p>
205
+ <ul>
206
+ <li>You will need Cassandra version 0.7.0-rc2 or greater.
207
+
208
+ </li>
209
+ <li>You will need to set a max number of classes since the current implementation of
210
+ the Ruby Cassandra client doesn&#8216;t support table scans.
211
+
212
+ </li>
213
+ <li>Prior to using the Cassandra storage you will need to run the following
214
+ command from the cassandra-cli: &quot;create keyspace ankusa with
215
+ replication_factor = 1&quot;. This should be fixed with a new release
216
+ candidate for Cassandra.
217
+
218
+ </li>
219
+ </ul>
220
+ <p>
221
+ To use the Cassandra storage class:
222
+ </p>
223
+ <pre>
224
+ require 'ankusa/cassandra_storage'
225
+ # defaults: host='127.0.0.1', port=9160, keyspace = 'ankusa', max_classes = 100
226
+ storage = Ankusa::CassandraStorage.new host, port, keyspace, max_classes
227
+ </pre>
228
+ <h2>Running Tests</h2>
229
+ <p>
230
+ You can run the tests for any of the three storage methods. For instance,
231
+ for memory storage:
232
+ </p>
233
+ <pre>
234
+ rake test_memory
235
+ </pre>
236
+ <p>
237
+ For the other methods you will need to edit the file test/config.yml and
238
+ set the configuration params. Then:
239
+ </p>
240
+ <pre>
241
+ rake test_hbase
242
+ # or
243
+ rake test_cassandra
244
+ </pre>
124
245
 
125
246
  </div>
126
247