ankusa 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -1,6 +1,8 @@
1
1
  = ankusa
2
2
 
3
- Ankusa is a Naive Bayes classifier in Ruby that uses Hadoop's HBase for storage. Because it uses HBase as a backend, the training corpus can be many terabytes in size.
3
+ Ankusa is a text classifier in Ruby that uses Hadoop's HBase for storage. Because it uses HBase as a backend, the training corpus can be many terabytes in size.
4
+
5
+ Ankusa currently uses a Naive Bayes classifier. It ignores common words (a.k.a. stop words) and stems all others. Additionally, it uses Laplacian smoothing in the classification method.
4
6
 
5
7
  == Installation
6
8
  First, install HBase / Hadoop. Make sure the HBase Thrift interface has been started as well. Then:
@@ -15,6 +17,8 @@ First, install HBase / Hadoop. Make sure the HBase Thrift interface has been st
15
17
  storage = Ankusa::HBaseStorage.new 'localhost'
16
18
  c = Ankusa::Classifier.new storage
17
19
 
20
+ # Each of these calls will return a bag-of-words
21
+ # hash with stemmed words as keys and counts as values
18
22
  c.train :spam, "This is some spammy text"
19
23
  c.train :good, "This is not the bad stuff"
20
24
 
@@ -25,6 +29,11 @@ First, install HBase / Hadoop. Make sure the HBase Thrift interface has been st
25
29
  # membership probability as values
26
30
  puts c.classifications "This is some spammy text"
27
31
 
32
+ # If you have a large corpus, the probabilities will
33
+ # likely all be 0. In that case, you must use log
34
+ # likelihood values
35
+ puts c.log_likelihoods "This is some spammy text"
36
+
28
37
  # get a list of all classes
29
38
  puts c.classes
30
39
 
data/Rakefile CHANGED
@@ -22,11 +22,11 @@ Rake::TestTask.new("test") { |t|
22
22
 
23
23
  spec = Gem::Specification.new do |s|
24
24
  s.name = "ankusa"
25
- s.version = "0.0.5"
25
+ s.version = "0.0.6"
26
26
  s.authors = ["Brian Muller"]
27
- s.date = %q{2010-12-03}
28
- s.description = "Naive Bayes classifier with HBase storage"
29
- s.summary = "Naive Bayes classifier in Ruby that uses Hadoop's HBase for storage"
27
+ s.date = %q{2010-12-06}
28
+ s.description = "Text classifier with HBase storage"
29
+ s.summary = "Text classifier in Ruby that uses Hadoop's HBase for storage"
30
30
  s.email = "brian.muller@livingsocial.com"
31
31
  s.files = FileList["lib/**/*", "[A-Z]*", "Rakefile", "docs/**/*"]
32
32
  s.homepage = "https://github.com/livingsocial/ankusa"
@@ -88,10 +88,13 @@
88
88
  <div class="name-list">
89
89
  <a href="#M000007">classifications</a>&nbsp;&nbsp;
90
90
  <a href="#M000006">classify</a>&nbsp;&nbsp;
91
- <a href="#M000008">get_word_probs</a>&nbsp;&nbsp;
91
+ <a href="#M000010">doc_count_totals</a>&nbsp;&nbsp;
92
+ <a href="#M000009">get_word_probs</a>&nbsp;&nbsp;
93
+ <a href="#M000008">log_likelihoods</a>&nbsp;&nbsp;
92
94
  <a href="#M000003">new</a>&nbsp;&nbsp;
93
95
  <a href="#M000004">train</a>&nbsp;&nbsp;
94
96
  <a href="#M000005">untrain</a>&nbsp;&nbsp;
97
+ <a href="#M000011">vocab_sizes</a>&nbsp;&nbsp;
95
98
  </div>
96
99
  </div>
97
100
 
@@ -158,33 +161,28 @@
158
161
 
159
162
  <div class="method-heading">
160
163
  <a href="#M000007" class="method-signature">
161
- <span class="method-name">classifications</span><span class="method-args">(text)</span>
164
+ <span class="method-name">classifications</span><span class="method-args">(text, classnames=nil)</span>
162
165
  </a>
163
166
  </div>
164
167
 
165
168
  <div class="method-description">
169
+ <p>
170
+ classnames is an optional array of the classes to look at; when nil, all known classes are used
171
+ </p>
166
172
  <p><a class="source-toggle" href="#"
167
173
  onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
168
174
  <div class="method-source-code" id="M000007-source">
169
175
  <pre>
170
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 46</span>
171
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>)
172
- <span class="ruby-identifier">result</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
173
-
174
- <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
175
- <span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>)
176
- <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) <span class="ruby-operator">*</span> <span class="ruby-identifier">count</span>) }
176
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 53</span>
177
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword kw">nil</span>)
178
+ <span class="ruby-identifier">result</span> = <span class="ruby-identifier">log_likelihoods</span> <span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>
179
+ <span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
180
+ <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>]
177
181
  }
178
182
 
179
- <span class="ruby-comment cmt"># add the prior and exponentiate</span>
180
- <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
181
- <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">doc_count_total</span>.<span class="ruby-identifier">to_f</span>)
182
- <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span>(<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>])
183
- }
184
-
185
183
  <span class="ruby-comment cmt"># normalize to get probs</span>
186
184
  <span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
187
- <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span> }
185
+ <span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span> }
188
186
  <span class="ruby-identifier">result</span>
189
187
  <span class="ruby-keyword kw">end</span>
190
188
  </pre>
@@ -197,7 +195,7 @@
197
195
 
198
196
  <div class="method-heading">
199
197
  <a href="#M000006" class="method-signature">
200
- <span class="method-name">classify</span><span class="method-args">(text)</span>
198
+ <span class="method-name">classify</span><span class="method-args">(text, classes=nil)</span>
201
199
  </a>
202
200
  </div>
203
201
 
@@ -206,10 +204,51 @@
206
204
  onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
207
205
  <div class="method-source-code" id="M000006-source">
208
206
  <pre>
209
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 41</span>
210
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>)
207
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 47</span>
208
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>=<span class="ruby-keyword kw">nil</span>)
211
209
  <span class="ruby-comment cmt"># return the most probable class</span>
212
- <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">sort_by</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">c</span><span class="ruby-operator">|</span> <span class="ruby-operator">-</span><span class="ruby-identifier">c</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
210
+ <span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>).<span class="ruby-identifier">sort_by</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">c</span><span class="ruby-operator">|</span> <span class="ruby-operator">-</span><span class="ruby-identifier">c</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
211
+ <span class="ruby-keyword kw">end</span>
212
+ </pre>
213
+ </div>
214
+ </div>
215
+ </div>
216
+
217
+ <div id="method-M000008" class="method-detail">
218
+ <a name="M000008"></a>
219
+
220
+ <div class="method-heading">
221
+ <a href="#M000008" class="method-signature">
222
+ <span class="method-name">log_likelihoods</span><span class="method-args">(text, classnames=nil)</span>
223
+ </a>
224
+ </div>
225
+
226
+ <div class="method-description">
227
+ <p>
228
+ classnames is an optional array of the classes to look at; when nil, all known classes are used
229
+ </p>
230
+ <p><a class="source-toggle" href="#"
231
+ onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
232
+ <div class="method-source-code" id="M000008-source">
233
+ <pre>
234
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 66</span>
235
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword kw">nil</span>)
236
+ <span class="ruby-identifier">classnames</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@classnames</span>
237
+ <span class="ruby-identifier">result</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
238
+
239
+ <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
240
+ <span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classnames</span>)
241
+ <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) <span class="ruby-operator">*</span> <span class="ruby-identifier">count</span>) }
242
+ }
243
+
244
+ <span class="ruby-comment cmt"># add the log prior (results stay in log space; no exponentiation here)</span>
245
+ <span class="ruby-identifier">doc_counts</span> = <span class="ruby-identifier">doc_count_totals</span>.<span class="ruby-identifier">select</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">k</span> }.<span class="ruby-identifier">map</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">v</span> }
246
+ <span class="ruby-identifier">doc_count_total</span> = (<span class="ruby-identifier">doc_counts</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> } <span class="ruby-operator">+</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">length</span>).<span class="ruby-identifier">to_f</span>
247
+ <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
248
+ <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>((<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>) <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">doc_count_total</span>)
249
+ }
250
+
251
+ <span class="ruby-identifier">result</span>
213
252
  <span class="ruby-keyword kw">end</span>
214
253
  </pre>
215
254
  </div>
@@ -244,6 +283,9 @@ text can be either an array of strings or a string klass is a symbol
244
283
  <span class="ruby-identifier">doccount</span> = (<span class="ruby-identifier">text</span>.<span class="ruby-identifier">kind_of?</span> <span class="ruby-constant">Array</span>) <span class="ruby-operator">?</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">:</span> <span class="ruby-value">1</span>
245
284
  <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_doc_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">doccount</span>
246
285
  <span class="ruby-ivar">@classnames</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">klass</span> <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">klass</span>
286
+ <span class="ruby-comment cmt"># the cached values of these vars are now stale</span>
287
+ <span class="ruby-ivar">@doc_count_totals</span> = <span class="ruby-keyword kw">nil</span>
288
+ <span class="ruby-ivar">@vocab_sizes</span> = <span class="ruby-keyword kw">nil</span>
247
289
  <span class="ruby-identifier">th</span>
248
290
  <span class="ruby-keyword kw">end</span>
249
291
  </pre>
@@ -268,7 +310,7 @@ text can be either an array of strings or a string klass is a symbol
268
310
  onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
269
311
  <div class="method-source-code" id="M000005-source">
270
312
  <pre>
271
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 29</span>
313
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 32</span>
272
314
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">untrain</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">text</span>)
273
315
  <span class="ruby-identifier">th</span> = <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>)
274
316
  <span class="ruby-identifier">th</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
@@ -278,6 +320,9 @@ text can be either an array of strings or a string klass is a symbol
278
320
  <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_total_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
279
321
  <span class="ruby-identifier">doccount</span> = (<span class="ruby-identifier">text</span>.<span class="ruby-identifier">kind_of?</span> <span class="ruby-constant">Array</span>) <span class="ruby-operator">?</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">:</span> <span class="ruby-value">1</span>
280
322
  <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_doc_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">doccount</span>
323
+ <span class="ruby-comment cmt"># the cached values of these vars are now stale</span>
324
+ <span class="ruby-ivar">@doc_count_totals</span> = <span class="ruby-keyword kw">nil</span>
325
+ <span class="ruby-ivar">@vocab_sizes</span> = <span class="ruby-keyword kw">nil</span>
281
326
  <span class="ruby-identifier">th</span>
282
327
  <span class="ruby-keyword kw">end</span>
283
328
  </pre>
@@ -287,26 +332,51 @@ text can be either an array of strings or a string klass is a symbol
287
332
 
288
333
  <h3 class="section-bar">Protected Instance methods</h3>
289
334
 
290
- <div id="method-M000008" class="method-detail">
291
- <a name="M000008"></a>
335
+ <div id="method-M000010" class="method-detail">
336
+ <a name="M000010"></a>
292
337
 
293
338
  <div class="method-heading">
294
- <a href="#M000008" class="method-signature">
295
- <span class="method-name">get_word_probs</span><span class="method-args">(word)</span>
339
+ <a href="#M000010" class="method-signature">
340
+ <span class="method-name">doc_count_totals</span><span class="method-args">()</span>
296
341
  </a>
297
342
  </div>
298
343
 
299
344
  <div class="method-description">
300
345
  <p><a class="source-toggle" href="#"
301
- onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
302
- <div class="method-source-code" id="M000008-source">
346
+ onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
347
+ <div class="method-source-code" id="M000010-source">
303
348
  <pre>
304
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 67</span>
305
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>)
306
- <span class="ruby-identifier">probs</span> = <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_word_counts</span>(<span class="ruby-identifier">word</span>)
307
- <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cn</span><span class="ruby-operator">|</span>
349
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 97</span>
350
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">doc_count_totals</span>
351
+ <span class="ruby-ivar">@doc_count_totals</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">doc_count_totals</span>
352
+ <span class="ruby-keyword kw">end</span>
353
+ </pre>
354
+ </div>
355
+ </div>
356
+ </div>
357
+
358
+ <div id="method-M000009" class="method-detail">
359
+ <a name="M000009"></a>
360
+
361
+ <div class="method-heading">
362
+ <a href="#M000009" class="method-signature">
363
+ <span class="method-name">get_word_probs</span><span class="method-args">(word, classnames)</span>
364
+ </a>
365
+ </div>
366
+
367
+ <div class="method-description">
368
+ <p><a class="source-toggle" href="#"
369
+ onclick="toggleCode('M000009-source');return false;">[Source]</a></p>
370
+ <div class="method-source-code" id="M000009-source">
371
+ <pre>
372
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 86</span>
373
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classnames</span>)
374
+ <span class="ruby-identifier">probs</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
375
+ <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_word_counts</span>(<span class="ruby-identifier">word</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">v</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">k</span> }
376
+ <span class="ruby-identifier">vs</span> = <span class="ruby-identifier">vocab_sizes</span>
377
+ <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cn</span><span class="ruby-operator">|</span>
308
378
  <span class="ruby-comment cmt"># use a laplacian smoother</span>
309
- <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] = (<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> (<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_total_word_count</span>(<span class="ruby-identifier">cn</span>) <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span>
379
+ <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] = (<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> (<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_total_word_count</span>(<span class="ruby-identifier">cn</span>) <span class="ruby-operator">+</span> <span class="ruby-identifier">vs</span>[<span class="ruby-identifier">cn</span>]).<span class="ruby-identifier">to_f</span>
310
380
  }
311
381
  <span class="ruby-identifier">probs</span>
312
382
  <span class="ruby-keyword kw">end</span>
@@ -315,6 +385,29 @@ text can be either an array of strings or a string klass is a symbol
315
385
  </div>
316
386
  </div>
317
387
 
388
+ <div id="method-M000011" class="method-detail">
389
+ <a name="M000011"></a>
390
+
391
+ <div class="method-heading">
392
+ <a href="#M000011" class="method-signature">
393
+ <span class="method-name">vocab_sizes</span><span class="method-args">()</span>
394
+ </a>
395
+ </div>
396
+
397
+ <div class="method-description">
398
+ <p><a class="source-toggle" href="#"
399
+ onclick="toggleCode('M000011-source');return false;">[Source]</a></p>
400
+ <div class="method-source-code" id="M000011-source">
401
+ <pre>
402
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 101</span>
403
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">vocab_sizes</span>
404
+ <span class="ruby-ivar">@vocab_sizes</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_vocabulary_sizes</span>
405
+ <span class="ruby-keyword kw">end</span>
406
+ </pre>
407
+ </div>
408
+ </div>
409
+ </div>
410
+
318
411
 
319
412
  </div>
320
413
 
@@ -86,21 +86,23 @@
86
86
  <h3 class="section-bar">Methods</h3>
87
87
 
88
88
  <div class="name-list">
89
- <a href="#M000023">classnames</a>&nbsp;&nbsp;
90
- <a href="#M000034">close</a>&nbsp;&nbsp;
91
- <a href="#M000033">doc_count_total</a>&nbsp;&nbsp;
92
- <a href="#M000025">drop_tables</a>&nbsp;&nbsp;
93
- <a href="#M000036">freq_table</a>&nbsp;&nbsp;
94
- <a href="#M000029">get_doc_count</a>&nbsp;&nbsp;
95
- <a href="#M000028">get_total_word_count</a>&nbsp;&nbsp;
96
- <a href="#M000027">get_word_counts</a>&nbsp;&nbsp;
97
- <a href="#M000032">incr_doc_count</a>&nbsp;&nbsp;
98
- <a href="#M000031">incr_total_word_count</a>&nbsp;&nbsp;
99
- <a href="#M000030">incr_word_count</a>&nbsp;&nbsp;
100
- <a href="#M000026">init_tables</a>&nbsp;&nbsp;
101
- <a href="#M000022">new</a>&nbsp;&nbsp;
102
- <a href="#M000024">reset</a>&nbsp;&nbsp;
103
- <a href="#M000035">summary_table</a>&nbsp;&nbsp;
89
+ <a href="#M000027">classnames</a>&nbsp;&nbsp;
90
+ <a href="#M000039">close</a>&nbsp;&nbsp;
91
+ <a href="#M000038">doc_count_totals</a>&nbsp;&nbsp;
92
+ <a href="#M000029">drop_tables</a>&nbsp;&nbsp;
93
+ <a href="#M000042">freq_table</a>&nbsp;&nbsp;
94
+ <a href="#M000034">get_doc_count</a>&nbsp;&nbsp;
95
+ <a href="#M000040">get_summary</a>&nbsp;&nbsp;
96
+ <a href="#M000033">get_total_word_count</a>&nbsp;&nbsp;
97
+ <a href="#M000032">get_vocabulary_sizes</a>&nbsp;&nbsp;
98
+ <a href="#M000031">get_word_counts</a>&nbsp;&nbsp;
99
+ <a href="#M000037">incr_doc_count</a>&nbsp;&nbsp;
100
+ <a href="#M000036">incr_total_word_count</a>&nbsp;&nbsp;
101
+ <a href="#M000035">incr_word_count</a>&nbsp;&nbsp;
102
+ <a href="#M000030">init_tables</a>&nbsp;&nbsp;
103
+ <a href="#M000026">new</a>&nbsp;&nbsp;
104
+ <a href="#M000028">reset</a>&nbsp;&nbsp;
105
+ <a href="#M000041">summary_table</a>&nbsp;&nbsp;
104
106
  </div>
105
107
  </div>
106
108
 
@@ -135,19 +137,19 @@
135
137
  <div id="methods">
136
138
  <h3 class="section-bar">Public Class methods</h3>
137
139
 
138
- <div id="method-M000022" class="method-detail">
139
- <a name="M000022"></a>
140
+ <div id="method-M000026" class="method-detail">
141
+ <a name="M000026"></a>
140
142
 
141
143
  <div class="method-heading">
142
- <a href="#M000022" class="method-signature">
144
+ <a href="#M000026" class="method-signature">
143
145
  <span class="method-name">new</span><span class="method-args">(host='localhost', port=9090, frequency_tablename=&quot;ankusa_word_frequencies&quot;, summary_tablename=&quot;ankusa_summary&quot;)</span>
144
146
  </a>
145
147
  </div>
146
148
 
147
149
  <div class="method-description">
148
150
  <p><a class="source-toggle" href="#"
149
- onclick="toggleCode('M000022-source');return false;">[Source]</a></p>
150
- <div class="method-source-code" id="M000022-source">
151
+ onclick="toggleCode('M000026-source');return false;">[Source]</a></p>
152
+ <div class="method-source-code" id="M000026-source">
151
153
  <pre>
152
154
  <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 8</span>
153
155
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">host</span>=<span class="ruby-value str">'localhost'</span>, <span class="ruby-identifier">port</span>=<span class="ruby-value">9090</span>, <span class="ruby-identifier">frequency_tablename</span>=<span class="ruby-value str">&quot;ankusa_word_frequencies&quot;</span>, <span class="ruby-identifier">summary_tablename</span>=<span class="ruby-value str">&quot;ankusa_summary&quot;</span>)
@@ -165,19 +167,19 @@
165
167
 
166
168
  <h3 class="section-bar">Public Instance methods</h3>
167
169
 
168
- <div id="method-M000023" class="method-detail">
169
- <a name="M000023"></a>
170
+ <div id="method-M000027" class="method-detail">
171
+ <a name="M000027"></a>
170
172
 
171
173
  <div class="method-heading">
172
- <a href="#M000023" class="method-signature">
174
+ <a href="#M000027" class="method-signature">
173
175
  <span class="method-name">classnames</span><span class="method-args">()</span>
174
176
  </a>
175
177
  </div>
176
178
 
177
179
  <div class="method-description">
178
180
  <p><a class="source-toggle" href="#"
179
- onclick="toggleCode('M000023-source');return false;">[Source]</a></p>
180
- <div class="method-source-code" id="M000023-source">
181
+ onclick="toggleCode('M000027-source');return false;">[Source]</a></p>
182
+ <div class="method-source-code" id="M000027-source">
181
183
  <pre>
182
184
  <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 17</span>
183
185
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classnames</span>
@@ -192,21 +194,21 @@
192
194
  </div>
193
195
  </div>
194
196
 
195
- <div id="method-M000034" class="method-detail">
196
- <a name="M000034"></a>
197
+ <div id="method-M000039" class="method-detail">
198
+ <a name="M000039"></a>
197
199
 
198
200
  <div class="method-heading">
199
- <a href="#M000034" class="method-signature">
201
+ <a href="#M000039" class="method-signature">
200
202
  <span class="method-name">close</span><span class="method-args">()</span>
201
203
  </a>
202
204
  </div>
203
205
 
204
206
  <div class="method-description">
205
207
  <p><a class="source-toggle" href="#"
206
- onclick="toggleCode('M000034-source');return false;">[Source]</a></p>
207
- <div class="method-source-code" id="M000034-source">
208
+ onclick="toggleCode('M000039-source');return false;">[Source]</a></p>
209
+ <div class="method-source-code" id="M000039-source">
208
210
  <pre>
209
- <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 94</span>
211
+ <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 103</span>
210
212
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">close</span>
211
213
  <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">close</span>
212
214
  <span class="ruby-keyword kw">end</span>
@@ -215,46 +217,42 @@
215
217
  </div>
216
218
  </div>
217
219
 
218
- <div id="method-M000033" class="method-detail">
219
- <a name="M000033"></a>
220
+ <div id="method-M000038" class="method-detail">
221
+ <a name="M000038"></a>
220
222
 
221
223
  <div class="method-heading">
222
- <a href="#M000033" class="method-signature">
223
- <span class="method-name">doc_count_total</span><span class="method-args">()</span>
224
+ <a href="#M000038" class="method-signature">
225
+ <span class="method-name">doc_count_totals</span><span class="method-args">()</span>
224
226
  </a>
225
227
  </div>
226
228
 
227
229
  <div class="method-description">
228
230
  <p><a class="source-toggle" href="#"
229
- onclick="toggleCode('M000033-source');return false;">[Source]</a></p>
230
- <div class="method-source-code" id="M000033-source">
231
+ onclick="toggleCode('M000038-source');return false;">[Source]</a></p>
232
+ <div class="method-source-code" id="M000038-source">
231
233
  <pre>
232
- <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 86</span>
233
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">doc_count_total</span>
234
- <span class="ruby-identifier">total</span> = <span class="ruby-value">0</span>
235
- <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">create_scanner</span>(<span class="ruby-value str">&quot;&quot;</span>, <span class="ruby-value str">&quot;totals:doccount&quot;</span>) { <span class="ruby-operator">|</span><span class="ruby-identifier">row</span><span class="ruby-operator">|</span>
236
- <span class="ruby-identifier">total</span> <span class="ruby-operator">+=</span> <span class="ruby-identifier">row</span>.<span class="ruby-identifier">columns</span>[<span class="ruby-value str">&quot;totals:doccount&quot;</span>].<span class="ruby-identifier">to_i64</span>
237
- }
238
- <span class="ruby-identifier">total</span>
234
+ <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 99</span>
235
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">doc_count_totals</span>
236
+ <span class="ruby-identifier">get_summary</span> <span class="ruby-value str">&quot;totals:doccount&quot;</span>
239
237
  <span class="ruby-keyword kw">end</span>
240
238
  </pre>
241
239
  </div>
242
240
  </div>
243
241
  </div>
244
242
 
245
- <div id="method-M000025" class="method-detail">
246
- <a name="M000025"></a>
243
+ <div id="method-M000029" class="method-detail">
244
+ <a name="M000029"></a>
247
245
 
248
246
  <div class="method-heading">
249
- <a href="#M000025" class="method-signature">
247
+ <a href="#M000029" class="method-signature">
250
248
  <span class="method-name">drop_tables</span><span class="method-args">()</span>
251
249
  </a>
252
250
  </div>
253
251
 
254
252
  <div class="method-description">
255
253
  <p><a class="source-toggle" href="#"
256
- onclick="toggleCode('M000025-source');return false;">[Source]</a></p>
257
- <div class="method-source-code" id="M000025-source">
254
+ onclick="toggleCode('M000029-source');return false;">[Source]</a></p>
255
+ <div class="method-source-code" id="M000029-source">
258
256
  <pre>
259
257
  <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 30</span>
260
258
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">drop_tables</span>
@@ -270,21 +268,21 @@
270
268
  </div>
271
269
  </div>
272
270
 
273
- <div id="method-M000029" class="method-detail">
274
- <a name="M000029"></a>
271
+ <div id="method-M000034" class="method-detail">
272
+ <a name="M000034"></a>
275
273
 
276
274
  <div class="method-heading">
277
- <a href="#M000029" class="method-signature">
275
+ <a href="#M000034" class="method-signature">
278
276
  <span class="method-name">get_doc_count</span><span class="method-args">(klass)</span>
279
277
  </a>
280
278
  </div>
281
279
 
282
280
  <div class="method-description">
283
281
  <p><a class="source-toggle" href="#"
284
- onclick="toggleCode('M000029-source');return false;">[Source]</a></p>
285
- <div class="method-source-code" id="M000029-source">
282
+ onclick="toggleCode('M000034-source');return false;">[Source]</a></p>
283
+ <div class="method-source-code" id="M000034-source">
286
284
  <pre>
287
- <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 68</span>
285
+ <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 73</span>
288
286
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">klass</span>)
289
287
  <span class="ruby-ivar">@klass_doc_counts</span>.<span class="ruby-identifier">fetch</span>(<span class="ruby-identifier">klass</span>) {
290
288
  <span class="ruby-ivar">@klass_doc_counts</span>[<span class="ruby-identifier">klass</span>] = <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">get</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-value str">&quot;totals:doccount&quot;</span>).<span class="ruby-identifier">first</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span>
@@ -295,21 +293,21 @@
295
293
  </div>
296
294
  </div>
297
295
 
298
- <div id="method-M000028" class="method-detail">
299
- <a name="M000028"></a>
296
+ <div id="method-M000033" class="method-detail">
297
+ <a name="M000033"></a>
300
298
 
301
299
  <div class="method-heading">
302
- <a href="#M000028" class="method-signature">
300
+ <a href="#M000033" class="method-signature">
303
301
  <span class="method-name">get_total_word_count</span><span class="method-args">(klass)</span>
304
302
  </a>
305
303
  </div>
306
304
 
307
305
  <div class="method-description">
308
306
  <p><a class="source-toggle" href="#"
309
- onclick="toggleCode('M000028-source');return false;">[Source]</a></p>
310
- <div class="method-source-code" id="M000028-source">
307
+ onclick="toggleCode('M000033-source');return false;">[Source]</a></p>
308
+ <div class="method-source-code" id="M000033-source">
311
309
  <pre>
312
- <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 62</span>
310
+ <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 67</span>
313
311
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_total_word_count</span>(<span class="ruby-identifier">klass</span>)
314
312
  <span class="ruby-ivar">@klass_word_counts</span>.<span class="ruby-identifier">fetch</span>(<span class="ruby-identifier">klass</span>) {
315
313
  <span class="ruby-ivar">@klass_word_counts</span>[<span class="ruby-identifier">klass</span>] = <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">get</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-value str">&quot;totals:wordcount&quot;</span>).<span class="ruby-identifier">first</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span>
@@ -320,19 +318,42 @@
320
318
  </div>
321
319
  </div>
322
320
 
323
- <div id="method-M000027" class="method-detail">
324
- <a name="M000027"></a>
321
+ <div id="method-M000032" class="method-detail">
322
+ <a name="M000032"></a>
325
323
 
326
324
  <div class="method-heading">
327
- <a href="#M000027" class="method-signature">
325
+ <a href="#M000032" class="method-signature">
326
+ <span class="method-name">get_vocabulary_sizes</span><span class="method-args">()</span>
327
+ </a>
328
+ </div>
329
+
330
+ <div class="method-description">
331
+ <p><a class="source-toggle" href="#"
332
+ onclick="toggleCode('M000032-source');return false;">[Source]</a></p>
333
+ <div class="method-source-code" id="M000032-source">
334
+ <pre>
335
+ <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 63</span>
336
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_vocabulary_sizes</span>
337
+ <span class="ruby-identifier">get_summary</span> <span class="ruby-value str">&quot;totals:vocabsize&quot;</span>
338
+ <span class="ruby-keyword kw">end</span>
339
+ </pre>
340
+ </div>
341
+ </div>
342
+ </div>
343
+
344
+ <div id="method-M000031" class="method-detail">
345
+ <a name="M000031"></a>
346
+
347
+ <div class="method-heading">
348
+ <a href="#M000031" class="method-signature">
328
349
  <span class="method-name">get_word_counts</span><span class="method-args">(word)</span>
329
350
  </a>
330
351
  </div>
331
352
 
332
353
  <div class="method-description">
333
354
  <p><a class="source-toggle" href="#"
334
- onclick="toggleCode('M000027-source');return false;">[Source]</a></p>
335
- <div class="method-source-code" id="M000027-source">
355
+ onclick="toggleCode('M000031-source');return false;">[Source]</a></p>
356
+ <div class="method-source-code" id="M000031-source">
336
357
  <pre>
337
358
  <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 49</span>
338
359
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_counts</span>(<span class="ruby-identifier">word</span>)
@@ -342,7 +363,8 @@
342
363
 
343
364
  <span class="ruby-identifier">row</span>.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">columns</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">colname</span>, <span class="ruby-identifier">cell</span><span class="ruby-operator">|</span>
344
365
  <span class="ruby-identifier">classname</span> = <span class="ruby-identifier">colname</span>.<span class="ruby-identifier">split</span>(<span class="ruby-value str">':'</span>)[<span class="ruby-value">1</span>].<span class="ruby-identifier">intern</span>
345
- <span class="ruby-identifier">counts</span>[<span class="ruby-identifier">classname</span>] = <span class="ruby-identifier">cell</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span>
366
+ <span class="ruby-comment cmt"># in case untrain has been called too many times</span>
367
+ <span class="ruby-identifier">counts</span>[<span class="ruby-identifier">classname</span>] = [<span class="ruby-identifier">cell</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span>, <span class="ruby-value">0</span>].<span class="ruby-identifier">max</span>
346
368
  }
347
369
 
348
370
  <span class="ruby-identifier">counts</span>
@@ -352,21 +374,21 @@
352
374
  </div>
353
375
  </div>
354
376
 
355
- <div id="method-M000032" class="method-detail">
356
- <a name="M000032"></a>
377
+ <div id="method-M000037" class="method-detail">
378
+ <a name="M000037"></a>
357
379
 
358
380
  <div class="method-heading">
359
- <a href="#M000032" class="method-signature">
381
+ <a href="#M000037" class="method-signature">
360
382
  <span class="method-name">incr_doc_count</span><span class="method-args">(klass, count)</span>
361
383
  </a>
362
384
  </div>
363
385
 
364
386
  <div class="method-description">
365
387
  <p><a class="source-toggle" href="#"
366
- onclick="toggleCode('M000032-source');return false;">[Source]</a></p>
367
- <div class="method-source-code" id="M000032-source">
388
+ onclick="toggleCode('M000037-source');return false;">[Source]</a></p>
389
+ <div class="method-source-code" id="M000037-source">
368
390
  <pre>
369
- <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 82</span>
391
+ <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 95</span>
370
392
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">incr_doc_count</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">count</span>)
371
393
  <span class="ruby-ivar">@klass_doc_counts</span>[<span class="ruby-identifier">klass</span>] = <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">&quot;totals:doccount&quot;</span>, <span class="ruby-identifier">count</span>
372
394
  <span class="ruby-keyword kw">end</span>
@@ -375,21 +397,21 @@
375
397
  </div>
376
398
  </div>
377
399
 
378
- <div id="method-M000031" class="method-detail">
379
- <a name="M000031"></a>
400
+ <div id="method-M000036" class="method-detail">
401
+ <a name="M000036"></a>
380
402
 
381
403
  <div class="method-heading">
382
- <a href="#M000031" class="method-signature">
404
+ <a href="#M000036" class="method-signature">
383
405
  <span class="method-name">incr_total_word_count</span><span class="method-args">(klass, count)</span>
384
406
  </a>
385
407
  </div>
386
408
 
387
409
  <div class="method-description">
388
410
  <p><a class="source-toggle" href="#"
389
- onclick="toggleCode('M000031-source');return false;">[Source]</a></p>
390
- <div class="method-source-code" id="M000031-source">
411
+ onclick="toggleCode('M000036-source');return false;">[Source]</a></p>
412
+ <div class="method-source-code" id="M000036-source">
391
413
  <pre>
392
- <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 78</span>
414
+ <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 91</span>
393
415
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">incr_total_word_count</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">count</span>)
394
416
  <span class="ruby-ivar">@klass_word_counts</span>[<span class="ruby-identifier">klass</span>] = <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">&quot;totals:wordcount&quot;</span>, <span class="ruby-identifier">count</span>
395
417
  <span class="ruby-keyword kw">end</span>
@@ -398,42 +420,50 @@
398
420
  </div>
399
421
  </div>
400
422
 
401
- <div id="method-M000030" class="method-detail">
402
- <a name="M000030"></a>
423
+ <div id="method-M000035" class="method-detail">
424
+ <a name="M000035"></a>
403
425
 
404
426
  <div class="method-heading">
405
- <a href="#M000030" class="method-signature">
427
+ <a href="#M000035" class="method-signature">
406
428
  <span class="method-name">incr_word_count</span><span class="method-args">(klass, word, count)</span>
407
429
  </a>
408
430
  </div>
409
431
 
410
432
  <div class="method-description">
411
433
  <p><a class="source-toggle" href="#"
412
- onclick="toggleCode('M000030-source');return false;">[Source]</a></p>
413
- <div class="method-source-code" id="M000030-source">
434
+ onclick="toggleCode('M000035-source');return false;">[Source]</a></p>
435
+ <div class="method-source-code" id="M000035-source">
414
436
  <pre>
415
- <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 74</span>
437
+ <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 79</span>
416
438
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">incr_word_count</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span>)
417
- <span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">word</span>, <span class="ruby-node">&quot;classes:#{klass.to_s}&quot;</span>, <span class="ruby-identifier">count</span>
439
+ <span class="ruby-identifier">size</span> = <span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">word</span>, <span class="ruby-node">&quot;classes:#{klass.to_s}&quot;</span>, <span class="ruby-identifier">count</span>
440
+ <span class="ruby-comment cmt"># if this is a new word, increase the klass's vocab size. If the new word</span>
441
+ <span class="ruby-comment cmt"># count is 0, then we need to decrement our vocab size</span>
442
+ <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">size</span> <span class="ruby-operator">==</span> <span class="ruby-identifier">count</span>
443
+ <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">&quot;totals:vocabsize&quot;</span>
444
+ <span class="ruby-keyword kw">elsif</span> <span class="ruby-identifier">size</span> <span class="ruby-operator">==</span> <span class="ruby-value">0</span>
445
+ <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">&quot;totals:vocabsize&quot;</span>, <span class="ruby-value">-1</span>
446
+ <span class="ruby-keyword kw">end</span>
447
+ <span class="ruby-identifier">size</span>
418
448
  <span class="ruby-keyword kw">end</span>
419
449
  </pre>
420
450
  </div>
421
451
  </div>
422
452
  </div>
423
453
 
424
- <div id="method-M000026" class="method-detail">
425
- <a name="M000026"></a>
454
+ <div id="method-M000030" class="method-detail">
455
+ <a name="M000030"></a>
426
456
 
427
457
  <div class="method-heading">
428
- <a href="#M000026" class="method-signature">
458
+ <a href="#M000030" class="method-signature">
429
459
  <span class="method-name">init_tables</span><span class="method-args">()</span>
430
460
  </a>
431
461
  </div>
432
462
 
433
463
  <div class="method-description">
434
464
  <p><a class="source-toggle" href="#"
435
- onclick="toggleCode('M000026-source');return false;">[Source]</a></p>
436
- <div class="method-source-code" id="M000026-source">
465
+ onclick="toggleCode('M000030-source');return false;">[Source]</a></p>
466
+ <div class="method-source-code" id="M000030-source">
437
467
  <pre>
438
468
  <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 39</span>
439
469
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">init_tables</span>
@@ -450,19 +480,19 @@
450
480
  </div>
451
481
  </div>
452
482
 
453
- <div id="method-M000024" class="method-detail">
454
- <a name="M000024"></a>
483
+ <div id="method-M000028" class="method-detail">
484
+ <a name="M000028"></a>
455
485
 
456
486
  <div class="method-heading">
457
- <a href="#M000024" class="method-signature">
487
+ <a href="#M000028" class="method-signature">
458
488
  <span class="method-name">reset</span><span class="method-args">()</span>
459
489
  </a>
460
490
  </div>
461
491
 
462
492
  <div class="method-description">
463
493
  <p><a class="source-toggle" href="#"
464
- onclick="toggleCode('M000024-source');return false;">[Source]</a></p>
465
- <div class="method-source-code" id="M000024-source">
494
+ onclick="toggleCode('M000028-source');return false;">[Source]</a></p>
495
+ <div class="method-source-code" id="M000028-source">
466
496
  <pre>
467
497
  <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 25</span>
468
498
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">reset</span>
@@ -476,21 +506,21 @@
476
506
 
477
507
  <h3 class="section-bar">Protected Instance methods</h3>
478
508
 
479
- <div id="method-M000036" class="method-detail">
480
- <a name="M000036"></a>
509
+ <div id="method-M000042" class="method-detail">
510
+ <a name="M000042"></a>
481
511
 
482
512
  <div class="method-heading">
483
- <a href="#M000036" class="method-signature">
513
+ <a href="#M000042" class="method-signature">
484
514
  <span class="method-name">freq_table</span><span class="method-args">()</span>
485
515
  </a>
486
516
  </div>
487
517
 
488
518
  <div class="method-description">
489
519
  <p><a class="source-toggle" href="#"
490
- onclick="toggleCode('M000036-source');return false;">[Source]</a></p>
491
- <div class="method-source-code" id="M000036-source">
520
+ onclick="toggleCode('M000042-source');return false;">[Source]</a></p>
521
+ <div class="method-source-code" id="M000042-source">
492
522
  <pre>
493
- <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 103</span>
523
+ <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 120</span>
494
524
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">freq_table</span>
495
525
  <span class="ruby-ivar">@ftable</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">get_table</span> <span class="ruby-ivar">@ftablename</span>
496
526
  <span class="ruby-keyword kw">end</span>
@@ -499,21 +529,48 @@
499
529
  </div>
500
530
  </div>
501
531
 
502
- <div id="method-M000035" class="method-detail">
503
- <a name="M000035"></a>
532
+ <div id="method-M000040" class="method-detail">
533
+ <a name="M000040"></a>
504
534
 
505
535
  <div class="method-heading">
506
- <a href="#M000035" class="method-signature">
536
+ <a href="#M000040" class="method-signature">
537
+ <span class="method-name">get_summary</span><span class="method-args">(name)</span>
538
+ </a>
539
+ </div>
540
+
541
+ <div class="method-description">
542
+ <p><a class="source-toggle" href="#"
543
+ onclick="toggleCode('M000040-source');return false;">[Source]</a></p>
544
+ <div class="method-source-code" id="M000040-source">
545
+ <pre>
546
+ <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 108</span>
547
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_summary</span>(<span class="ruby-identifier">name</span>)
548
+ <span class="ruby-identifier">counts</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
549
+ <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">create_scanner</span>(<span class="ruby-value str">&quot;&quot;</span>, <span class="ruby-identifier">name</span>) { <span class="ruby-operator">|</span><span class="ruby-identifier">row</span><span class="ruby-operator">|</span>
550
+ <span class="ruby-identifier">counts</span>[<span class="ruby-identifier">row</span>.<span class="ruby-identifier">row</span>.<span class="ruby-identifier">intern</span>] = <span class="ruby-identifier">row</span>.<span class="ruby-identifier">columns</span>[<span class="ruby-identifier">name</span>].<span class="ruby-identifier">to_i64</span>
551
+ }
552
+ <span class="ruby-identifier">counts</span>
553
+ <span class="ruby-keyword kw">end</span>
554
+ </pre>
555
+ </div>
556
+ </div>
557
+ </div>
558
+
559
+ <div id="method-M000041" class="method-detail">
560
+ <a name="M000041"></a>
561
+
562
+ <div class="method-heading">
563
+ <a href="#M000041" class="method-signature">
507
564
  <span class="method-name">summary_table</span><span class="method-args">()</span>
508
565
  </a>
509
566
  </div>
510
567
 
511
568
  <div class="method-description">
512
569
  <p><a class="source-toggle" href="#"
513
- onclick="toggleCode('M000035-source');return false;">[Source]</a></p>
514
- <div class="method-source-code" id="M000035-source">
570
+ onclick="toggleCode('M000041-source');return false;">[Source]</a></p>
571
+ <div class="method-source-code" id="M000041-source">
515
572
  <pre>
516
- <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 99</span>
573
+ <span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 116</span>
517
574
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">summary_table</span>
518
575
  <span class="ruby-ivar">@stable</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">get_table</span> <span class="ruby-ivar">@stablename</span>
519
576
  <span class="ruby-keyword kw">end</span>