ankusa 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -10,12 +10,11 @@ First, install HBase / Hadoop. Make sure the HBase Thrift interface has been st
10
10
  == Basic Usage
11
11
  require 'rubygems'
12
12
  require 'ankusa'
13
- require 'hbaserb'
14
13
 
15
14
  # connect to HBase
16
- client = HBaseRb::Client.new 'localhost'
15
+ storage = Ankusa::HBaseStorage.new 'localhost'
16
+ c = Ankusa::Classifier.new storage
17
17
 
18
- c = Classifier.new client
19
18
  c.train :spam, "This is some spammy text"
20
19
  c.train :good, "This is not the bad stuff"
21
20
 
@@ -27,4 +26,7 @@ First, install HBase / Hadoop. Make sure the HBase Thrift interface has been st
27
26
  puts c.classifications "This is some spammy text"
28
27
 
29
28
  # get a list of all classes
30
- puts c.classes
29
+ puts c.classes
30
+
31
+ # close connection
32
+ storage.close
data/Rakefile CHANGED
@@ -22,9 +22,9 @@ Rake::TestTask.new("test") { |t|
22
22
 
23
23
  spec = Gem::Specification.new do |s|
24
24
  s.name = "ankusa"
25
- s.version = "0.0.2"
25
+ s.version = "0.0.3"
26
26
  s.authors = ["Brian Muller"]
27
- s.date = %q{2010-11-29}
27
+ s.date = %q{2010-12-02}
28
28
  s.description = "Naive Bayes classifier with HBase storage"
29
29
  s.summary = "Naive Bayes classifier in Ruby that uses Hadoop's HBase for storage"
30
30
  s.email = "brian.muller@livingsocial.com"
@@ -63,8 +63,12 @@
63
63
  lib/ankusa/hasher.rb
64
64
  </a>
65
65
  <br />
66
- <a href="../files/lib/ankusa/nbclass_rb.html">
67
- lib/ankusa/nbclass.rb
66
+ <a href="../files/lib/ankusa/hbase_storage_rb.html">
67
+ lib/ankusa/hbase_storage.rb
68
+ </a>
69
+ <br />
70
+ <a href="../files/lib/ankusa/memory_storage_rb.html">
71
+ lib/ankusa/memory_storage.rb
68
72
  </a>
69
73
  <br />
70
74
  <a href="../files/lib/ankusa/stopwords_rb.html">
@@ -100,7 +104,8 @@
100
104
  <h3 class="section-bar">Classes and Modules</h3>
101
105
 
102
106
  Class <a href="Ankusa/Classifier.html" class="link">Ankusa::Classifier</a><br />
103
- Class <a href="Ankusa/NBClass.html" class="link">Ankusa::NBClass</a><br />
107
+ Class <a href="Ankusa/HBaseStorage.html" class="link">Ankusa::HBaseStorage</a><br />
108
+ Class <a href="Ankusa/MemoryStorage.html" class="link">Ankusa::MemoryStorage</a><br />
104
109
  Class <a href="Ankusa/TextHash.html" class="link">Ankusa::TextHash</a><br />
105
110
 
106
111
  </div>
@@ -110,11 +115,6 @@ Class <a href="Ankusa/TextHash.html" class="link">Ankusa::TextHash</a><br />
110
115
 
111
116
  <div class="name-list">
112
117
  <table summary="Constants">
113
- <tr class="top-aligned-row context-row">
114
- <td class="context-item-name">SMALL_PROB</td>
115
- <td>=</td>
116
- <td class="context-item-value">0.0001</td>
117
- </tr>
118
118
  <tr class="top-aligned-row context-row">
119
119
  <td class="context-item-name">STOPWORDS</td>
120
120
  <td>=</td>
@@ -86,19 +86,12 @@
86
86
  <h3 class="section-bar">Methods</h3>
87
87
 
88
88
  <div class="name-list">
89
- <a href="#M000005">classifications</a>&nbsp;&nbsp;
90
- <a href="#M000004">classify</a>&nbsp;&nbsp;
91
- <a href="#M000009">doc_count_total</a>&nbsp;&nbsp;
92
- <a href="#M000007">drop_tables</a>&nbsp;&nbsp;
93
- <a href="#M000013">freq_table</a>&nbsp;&nbsp;
94
- <a href="#M000010">get_word_probs</a>&nbsp;&nbsp;
95
- <a href="#M000011">init_tables</a>&nbsp;&nbsp;
96
- <a href="#M000001">new</a>&nbsp;&nbsp;
97
- <a href="#M000006">refresh_classnames</a>&nbsp;&nbsp;
98
- <a href="#M000008">reset</a>&nbsp;&nbsp;
99
- <a href="#M000012">summary_table</a>&nbsp;&nbsp;
100
- <a href="#M000002">train</a>&nbsp;&nbsp;
101
- <a href="#M000003">untrain</a>&nbsp;&nbsp;
89
+ <a href="#M000007">classifications</a>&nbsp;&nbsp;
90
+ <a href="#M000006">classify</a>&nbsp;&nbsp;
91
+ <a href="#M000008">get_word_probs</a>&nbsp;&nbsp;
92
+ <a href="#M000003">new</a>&nbsp;&nbsp;
93
+ <a href="#M000004">train</a>&nbsp;&nbsp;
94
+ <a href="#M000005">untrain</a>&nbsp;&nbsp;
102
95
  </div>
103
96
  </div>
104
97
 
@@ -133,27 +126,25 @@
133
126
  <div id="methods">
134
127
  <h3 class="section-bar">Public Class methods</h3>
135
128
 
136
- <div id="method-M000001" class="method-detail">
137
- <a name="M000001"></a>
129
+ <div id="method-M000003" class="method-detail">
130
+ <a name="M000003"></a>
138
131
 
139
132
  <div class="method-heading">
140
- <a href="#M000001" class="method-signature">
141
- <span class="method-name">new</span><span class="method-args">(hbase_client, frequency_tablename=&quot;ankusa_word_frequencies&quot;, summary_tablename=&quot;ankusa_summary&quot;)</span>
133
+ <a href="#M000003" class="method-signature">
134
+ <span class="method-name">new</span><span class="method-args">(storage)</span>
142
135
  </a>
143
136
  </div>
144
137
 
145
138
  <div class="method-description">
146
139
  <p><a class="source-toggle" href="#"
147
- onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
148
- <div class="method-source-code" id="M000001-source">
140
+ onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
141
+ <div class="method-source-code" id="M000003-source">
149
142
  <pre>
150
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 7</span>
151
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">hbase_client</span>, <span class="ruby-identifier">frequency_tablename</span>=<span class="ruby-value str">&quot;ankusa_word_frequencies&quot;</span>, <span class="ruby-identifier">summary_tablename</span>=<span class="ruby-value str">&quot;ankusa_summary&quot;</span>)
152
- <span class="ruby-ivar">@hbase</span> = <span class="ruby-identifier">hbase_client</span>
153
- <span class="ruby-ivar">@ftablename</span> = <span class="ruby-identifier">frequency_tablename</span>
154
- <span class="ruby-ivar">@stablename</span> = <span class="ruby-identifier">summary_tablename</span>
155
- <span class="ruby-identifier">init_tables</span>
156
- <span class="ruby-ivar">@classnames</span> = <span class="ruby-identifier">refresh_classnames</span>
143
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 6</span>
144
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">storage</span>)
145
+ <span class="ruby-ivar">@storage</span> = <span class="ruby-identifier">storage</span>
146
+ <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">init_tables</span>
147
+ <span class="ruby-ivar">@classnames</span> = <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">classnames</span>
157
148
  <span class="ruby-keyword kw">end</span>
158
149
  </pre>
159
150
  </div>
@@ -162,120 +153,39 @@
162
153
 
163
154
  <h3 class="section-bar">Public Instance methods</h3>
164
155
 
165
- <div id="method-M000005" class="method-detail">
166
- <a name="M000005"></a>
156
+ <div id="method-M000007" class="method-detail">
157
+ <a name="M000007"></a>
167
158
 
168
159
  <div class="method-heading">
169
- <a href="#M000005" class="method-signature">
160
+ <a href="#M000007" class="method-signature">
170
161
  <span class="method-name">classifications</span><span class="method-args">(text)</span>
171
162
  </a>
172
163
  </div>
173
164
 
174
165
  <div class="method-description">
175
166
  <p><a class="source-toggle" href="#"
176
- onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
177
- <div class="method-source-code" id="M000005-source">
167
+ onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
168
+ <div class="method-source-code" id="M000007-source">
178
169
  <pre>
179
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 39</span>
170
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 44</span>
180
171
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>)
181
- <span class="ruby-identifier">classes</span> = {}
182
- <span class="ruby-identifier">result</span> = {}
183
- <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
184
- <span class="ruby-identifier">classes</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">NBClass</span>.<span class="ruby-identifier">new</span> <span class="ruby-identifier">k</span>, <span class="ruby-identifier">summary_table</span>, <span class="ruby-identifier">freq_table</span>
185
- <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-value">0</span>
186
- }
172
+ <span class="ruby-identifier">result</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
187
173
 
188
- <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>,<span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
189
- <span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classes</span>)
190
- <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) }
174
+ <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
175
+ <span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>)
176
+ <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) <span class="ruby-operator">*</span> <span class="ruby-identifier">count</span>) }
191
177
  }
192
-
193
- <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">classes</span>[<span class="ruby-identifier">k</span>].<span class="ruby-identifier">doc_count</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">doc_count_total</span>) }
194
-
195
- <span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span>(<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>]) }
196
- <span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
197
- <span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">klass</span><span class="ruby-operator">|</span>
198
- <span class="ruby-identifier">result</span>[<span class="ruby-identifier">klass</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">klass</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span>
199
- }
200
-
201
- <span class="ruby-identifier">result</span>
202
- <span class="ruby-keyword kw">end</span>
203
- </pre>
204
- </div>
205
- </div>
206
- </div>
207
-
208
- <div id="method-M000004" class="method-detail">
209
- <a name="M000004"></a>
210
-
211
- <div class="method-heading">
212
- <a href="#M000004" class="method-signature">
213
- <span class="method-name">classify</span><span class="method-args">(text)</span>
214
- </a>
215
- </div>
216
-
217
- <div class="method-description">
218
- <p><a class="source-toggle" href="#"
219
- onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
220
- <div class="method-source-code" id="M000004-source">
221
- <pre>
222
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 34</span>
223
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>)
224
- <span class="ruby-comment cmt"># return the most probable class</span>
225
- <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">sort</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">o</span>,<span class="ruby-identifier">t</span><span class="ruby-operator">|</span> <span class="ruby-identifier">o</span>[<span class="ruby-value">1</span>] <span class="ruby-operator">&lt;=&gt;</span> <span class="ruby-identifier">t</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
226
- <span class="ruby-keyword kw">end</span>
227
- </pre>
228
- </div>
229
- </div>
230
- </div>
231
-
232
- <div id="method-M000009" class="method-detail">
233
- <a name="M000009"></a>
234
178
 
235
- <div class="method-heading">
236
- <a href="#M000009" class="method-signature">
237
- <span class="method-name">doc_count_total</span><span class="method-args">()</span>
238
- </a>
239
- </div>
240
-
241
- <div class="method-description">
242
- <p><a class="source-toggle" href="#"
243
- onclick="toggleCode('M000009-source');return false;">[Source]</a></p>
244
- <div class="method-source-code" id="M000009-source">
245
- <pre>
246
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 84</span>
247
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">doc_count_total</span>
248
- <span class="ruby-identifier">total</span> = <span class="ruby-value">0</span>
249
- <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">create_scanner</span>(<span class="ruby-value str">&quot;&quot;</span>, <span class="ruby-value str">&quot;totals:doccount&quot;</span>) { <span class="ruby-operator">|</span><span class="ruby-identifier">row</span><span class="ruby-operator">|</span>
250
- <span class="ruby-identifier">total</span> <span class="ruby-operator">+=</span> <span class="ruby-identifier">row</span>.<span class="ruby-identifier">columns</span>[<span class="ruby-value str">&quot;totals:doccount&quot;</span>].<span class="ruby-identifier">to_i64</span>
179
+ <span class="ruby-comment cmt"># add the prior and exponentiate</span>
180
+ <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
181
+ <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">doc_count_total</span>.<span class="ruby-identifier">to_f</span>)
182
+ <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span>(<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>])
251
183
  }
252
- <span class="ruby-identifier">total</span>
253
- <span class="ruby-keyword kw">end</span>
254
- </pre>
255
- </div>
256
- </div>
257
- </div>
258
-
259
- <div id="method-M000007" class="method-detail">
260
- <a name="M000007"></a>
261
-
262
- <div class="method-heading">
263
- <a href="#M000007" class="method-signature">
264
- <span class="method-name">drop_tables</span><span class="method-args">()</span>
265
- </a>
266
- </div>
267
184
 
268
- <div class="method-description">
269
- <p><a class="source-toggle" href="#"
270
- onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
271
- <div class="method-source-code" id="M000007-source">
272
- <pre>
273
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 72</span>
274
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">drop_tables</span>
275
- <span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">delete</span>
276
- <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">delete</span>
277
- <span class="ruby-ivar">@stable</span> = <span class="ruby-keyword kw">nil</span>
278
- <span class="ruby-ivar">@ftable</span> = <span class="ruby-keyword kw">nil</span>
185
+ <span class="ruby-comment cmt"># normalize to get probs</span>
186
+ <span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
187
+ <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span> }
188
+ <span class="ruby-identifier">result</span>
279
189
  <span class="ruby-keyword kw">end</span>
280
190
  </pre>
281
191
  </div>
@@ -287,77 +197,52 @@
287
197
 
288
198
  <div class="method-heading">
289
199
  <a href="#M000006" class="method-signature">
290
- <span class="method-name">refresh_classnames</span><span class="method-args">()</span>
200
+ <span class="method-name">classify</span><span class="method-args">(text)</span>
291
201
  </a>
292
202
  </div>
293
203
 
294
204
  <div class="method-description">
295
- <p>
296
- get all classes
297
- </p>
298
205
  <p><a class="source-toggle" href="#"
299
206
  onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
300
207
  <div class="method-source-code" id="M000006-source">
301
208
  <pre>
302
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 64</span>
303
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">refresh_classnames</span>
304
- <span class="ruby-identifier">cs</span> = []
305
- <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">create_scanner</span>(<span class="ruby-value str">&quot;&quot;</span>, <span class="ruby-value str">&quot;totals&quot;</span>) { <span class="ruby-operator">|</span><span class="ruby-identifier">row</span><span class="ruby-operator">|</span>
306
- <span class="ruby-identifier">cs</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">row</span>.<span class="ruby-identifier">row</span>.<span class="ruby-identifier">intern</span>
307
- }
308
- <span class="ruby-identifier">cs</span>
309
- <span class="ruby-keyword kw">end</span>
310
- </pre>
311
- </div>
312
- </div>
313
- </div>
314
-
315
- <div id="method-M000008" class="method-detail">
316
- <a name="M000008"></a>
317
-
318
- <div class="method-heading">
319
- <a href="#M000008" class="method-signature">
320
- <span class="method-name">reset</span><span class="method-args">()</span>
321
- </a>
322
- </div>
323
-
324
- <div class="method-description">
325
- <p><a class="source-toggle" href="#"
326
- onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
327
- <div class="method-source-code" id="M000008-source">
328
- <pre>
329
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 79</span>
330
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">reset</span>
331
- <span class="ruby-identifier">drop_tables</span>
332
- <span class="ruby-identifier">init_tables</span>
209
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 39</span>
210
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>)
211
+ <span class="ruby-comment cmt"># return the most probable class</span>
212
+ <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">sort_by</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">c</span><span class="ruby-operator">|</span> <span class="ruby-operator">-</span><span class="ruby-identifier">c</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
333
213
  <span class="ruby-keyword kw">end</span>
334
214
  </pre>
335
215
  </div>
336
216
  </div>
337
217
  </div>
338
218
 
339
- <div id="method-M000002" class="method-detail">
340
- <a name="M000002"></a>
219
+ <div id="method-M000004" class="method-detail">
220
+ <a name="M000004"></a>
341
221
 
342
222
  <div class="method-heading">
343
- <a href="#M000002" class="method-signature">
344
- <span class="method-name">train</span><span class="method-args">(klass, text)</span>
223
+ <a href="#M000004" class="method-signature">
224
+ <span class="method-name">train</span><span class="method-args">(klass, text) {|word, count if block_given?| ...}</span>
345
225
  </a>
346
226
  </div>
347
227
 
348
228
  <div class="method-description">
229
+ <p>
230
+ text can be either an array of strings or a string; klass is a symbol
231
+ </p>
349
232
  <p><a class="source-toggle" href="#"
350
- onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
351
- <div class="method-source-code" id="M000002-source">
233
+ onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
234
+ <div class="method-source-code" id="M000004-source">
352
235
  <pre>
353
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 15</span>
236
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 14</span>
354
237
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">train</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">text</span>)
355
238
  <span class="ruby-identifier">th</span> = <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>)
356
239
  <span class="ruby-identifier">th</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
357
- <span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">word</span>, <span class="ruby-node">&quot;classes:#{klass.to_s}&quot;</span>, <span class="ruby-identifier">count</span>
240
+ <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span>
241
+ <span class="ruby-keyword kw">yield</span> <span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">block_given?</span>
358
242
  }
359
- <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">&quot;totals:wordcount&quot;</span>, <span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
360
- <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">&quot;totals:doccount&quot;</span>
243
+ <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_total_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
244
+ <span class="ruby-identifier">doccount</span> = (<span class="ruby-identifier">text</span>.<span class="ruby-identifier">kind_of?</span> <span class="ruby-constant">Array</span>) <span class="ruby-operator">?</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">:</span> <span class="ruby-value">1</span>
245
+ <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_doc_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">doccount</span>
361
246
  <span class="ruby-ivar">@classnames</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">klass</span> <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">klass</span>
362
247
  <span class="ruby-keyword kw">end</span>
363
248
  </pre>
@@ -365,28 +250,33 @@ get all classes
365
250
  </div>
366
251
  </div>
367
252
 
368
- <div id="method-M000003" class="method-detail">
369
- <a name="M000003"></a>
253
+ <div id="method-M000005" class="method-detail">
254
+ <a name="M000005"></a>
370
255
 
371
256
  <div class="method-heading">
372
- <a href="#M000003" class="method-signature">
373
- <span class="method-name">untrain</span><span class="method-args">(klass, text)</span>
257
+ <a href="#M000005" class="method-signature">
258
+ <span class="method-name">untrain</span><span class="method-args">(klass, text) {|word, count if block_given?| ...}</span>
374
259
  </a>
375
260
  </div>
376
261
 
377
262
  <div class="method-description">
263
+ <p>
264
+ text can be either an array of strings or a string; klass is a symbol
265
+ </p>
378
266
  <p><a class="source-toggle" href="#"
379
- onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
380
- <div class="method-source-code" id="M000003-source">
267
+ onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
268
+ <div class="method-source-code" id="M000005-source">
381
269
  <pre>
382
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 25</span>
270
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 28</span>
383
271
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">untrain</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">text</span>)
384
272
  <span class="ruby-identifier">th</span> = <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>)
385
273
  <span class="ruby-identifier">th</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
386
- <span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">word</span>, <span class="ruby-node">&quot;classes:#{klass.to_s}&quot;</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">count</span>
274
+ <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">word</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">count</span>
275
+ <span class="ruby-keyword kw">yield</span> <span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">block_given?</span>
387
276
  }
388
- <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">&quot;totals:wordcount&quot;</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
389
- <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">&quot;totals:doccount&quot;</span>, <span class="ruby-value">-1</span>
277
+ <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_total_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
278
+ <span class="ruby-identifier">doccount</span> = (<span class="ruby-identifier">text</span>.<span class="ruby-identifier">kind_of?</span> <span class="ruby-constant">Array</span>) <span class="ruby-operator">?</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">:</span> <span class="ruby-value">1</span>
279
+ <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_doc_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">doccount</span>
390
280
  <span class="ruby-keyword kw">end</span>
391
281
  </pre>
392
282
  </div>
@@ -395,53 +285,26 @@ get all classes
395
285
 
396
286
  <h3 class="section-bar">Protected Instance methods</h3>
397
287
 
398
- <div id="method-M000013" class="method-detail">
399
- <a name="M000013"></a>
400
-
401
- <div class="method-heading">
402
- <a href="#M000013" class="method-signature">
403
- <span class="method-name">freq_table</span><span class="method-args">()</span>
404
- </a>
405
- </div>
406
-
407
- <div class="method-description">
408
- <p><a class="source-toggle" href="#"
409
- onclick="toggleCode('M000013-source');return false;">[Source]</a></p>
410
- <div class="method-source-code" id="M000013-source">
411
- <pre>
412
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 120</span>
413
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">freq_table</span>
414
- <span class="ruby-ivar">@ftable</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">get_table</span> <span class="ruby-ivar">@ftablename</span>
415
- <span class="ruby-keyword kw">end</span>
416
- </pre>
417
- </div>
418
- </div>
419
- </div>
420
-
421
- <div id="method-M000010" class="method-detail">
422
- <a name="M000010"></a>
288
+ <div id="method-M000008" class="method-detail">
289
+ <a name="M000008"></a>
423
290
 
424
291
  <div class="method-heading">
425
- <a href="#M000010" class="method-signature">
426
- <span class="method-name">get_word_probs</span><span class="method-args">(word, classes)</span>
292
+ <a href="#M000008" class="method-signature">
293
+ <span class="method-name">get_word_probs</span><span class="method-args">(word)</span>
427
294
  </a>
428
295
  </div>
429
296
 
430
297
  <div class="method-description">
431
298
  <p><a class="source-toggle" href="#"
432
- onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
433
- <div class="method-source-code" id="M000010-source">
299
+ onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
300
+ <div class="method-source-code" id="M000008-source">
434
301
  <pre>
435
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 93</span>
436
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classes</span>)
437
- <span class="ruby-identifier">probs</span> = {}
438
- <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cn</span><span class="ruby-operator">|</span> <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] = <span class="ruby-constant">Ankusa</span><span class="ruby-operator">::</span><span class="ruby-constant">SMALL_PROB</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">classes</span>[<span class="ruby-identifier">cn</span>].<span class="ruby-identifier">word_count</span> }
439
- <span class="ruby-identifier">row</span> = <span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">get_row</span>(<span class="ruby-identifier">word</span>)
440
- <span class="ruby-keyword kw">return</span> <span class="ruby-identifier">probs</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">row</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">==</span> <span class="ruby-value">0</span>
441
-
442
- <span class="ruby-identifier">row</span>.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">columns</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">colname</span>, <span class="ruby-identifier">cell</span><span class="ruby-operator">|</span>
443
- <span class="ruby-identifier">classname</span> = <span class="ruby-identifier">colname</span>.<span class="ruby-identifier">split</span>(<span class="ruby-value str">':'</span>)[<span class="ruby-value">1</span>].<span class="ruby-identifier">intern</span>
444
- <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">classname</span>] = <span class="ruby-identifier">cell</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">classes</span>[<span class="ruby-identifier">classname</span>].<span class="ruby-identifier">word_count</span>
302
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 65</span>
303
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>)
304
+ <span class="ruby-identifier">probs</span> = <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_word_counts</span>(<span class="ruby-identifier">word</span>)
305
+ <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cn</span><span class="ruby-operator">|</span>
306
+ <span class="ruby-comment cmt"># use a laplacian smoother</span>
307
+ <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] = (<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> (<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_total_word_count</span>(<span class="ruby-identifier">cn</span>) <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span>
445
308
  }
446
309
  <span class="ruby-identifier">probs</span>
447
310
  <span class="ruby-keyword kw">end</span>
@@ -450,58 +313,6 @@ get all classes
450
313
  </div>
451
314
  </div>
452
315
 
453
- <div id="method-M000011" class="method-detail">
454
- <a name="M000011"></a>
455
-
456
- <div class="method-heading">
457
- <a href="#M000011" class="method-signature">
458
- <span class="method-name">init_tables</span><span class="method-args">()</span>
459
- </a>
460
- </div>
461
-
462
- <div class="method-description">
463
- <p><a class="source-toggle" href="#"
464
- onclick="toggleCode('M000011-source');return false;">[Source]</a></p>
465
- <div class="method-source-code" id="M000011-source">
466
- <pre>
467
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 106</span>
468
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">init_tables</span>
469
- <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">has_table?</span> <span class="ruby-ivar">@ftablename</span>
470
- <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">create_table</span> <span class="ruby-ivar">@ftablename</span>, <span class="ruby-value str">&quot;classes&quot;</span>, <span class="ruby-value str">&quot;total&quot;</span>
471
- <span class="ruby-keyword kw">end</span>
472
-
473
- <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">has_table?</span> <span class="ruby-ivar">@stablename</span>
474
- <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">create_table</span> <span class="ruby-ivar">@stablename</span>, <span class="ruby-value str">&quot;totals&quot;</span>
475
- <span class="ruby-keyword kw">end</span>
476
- <span class="ruby-keyword kw">end</span>
477
- </pre>
478
- </div>
479
- </div>
480
- </div>
481
-
482
- <div id="method-M000012" class="method-detail">
483
- <a name="M000012"></a>
484
-
485
- <div class="method-heading">
486
- <a href="#M000012" class="method-signature">
487
- <span class="method-name">summary_table</span><span class="method-args">()</span>
488
- </a>
489
- </div>
490
-
491
- <div class="method-description">
492
- <p><a class="source-toggle" href="#"
493
- onclick="toggleCode('M000012-source');return false;">[Source]</a></p>
494
- <div class="method-source-code" id="M000012-source">
495
- <pre>
496
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 116</span>
497
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">summary_table</span>
498
- <span class="ruby-ivar">@stable</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">get_table</span> <span class="ruby-ivar">@stablename</span>
499
- <span class="ruby-keyword kw">end</span>
500
- </pre>
501
- </div>
502
- </div>
503
- </div>
504
-
505
316
 
506
317
  </div>
507
318