ankusa 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,12 +10,11 @@ First, install HBase / Hadoop. Make sure the HBase Thrift interface has been st
10
10
  == Basic Usage
11
11
  require 'rubygems'
12
12
  require 'ankusa'
13
- require 'hbaserb'
14
13
 
15
14
  # connect to HBase
16
- client = HBaseRb::Client.new 'localhost'
15
+ storage = Ankusa::HBaseStorage.new 'localhost'
16
+ c = Ankusa::Classifier.new storage
17
17
 
18
- c = Classifier.new client
19
18
  c.train :spam, "This is some spammy text"
20
19
  c.train :good, "This is not the bad stuff"
21
20
 
@@ -27,4 +26,7 @@ First, install HBase / Hadoop. Make sure the HBase Thrift interface has been st
27
26
  puts c.classifications "This is some spammy text"
28
27
 
29
28
  # get a list of all classes
30
- puts c.classes
29
+ puts c.classes
30
+
31
+ # close connection
32
+ storage.close
data/Rakefile CHANGED
@@ -22,9 +22,9 @@ Rake::TestTask.new("test") { |t|
22
22
 
23
23
  spec = Gem::Specification.new do |s|
24
24
  s.name = "ankusa"
25
- s.version = "0.0.2"
25
+ s.version = "0.0.3"
26
26
  s.authors = ["Brian Muller"]
27
- s.date = %q{2010-11-29}
27
+ s.date = %q{2010-12-02}
28
28
  s.description = "Naive Bayes classifier with HBase storage"
29
29
  s.summary = "Naive Bayes classifier in Ruby that uses Hadoop's HBase for storage"
30
30
  s.email = "brian.muller@livingsocial.com"
@@ -63,8 +63,12 @@
63
63
  lib/ankusa/hasher.rb
64
64
  </a>
65
65
  <br />
66
- <a href="../files/lib/ankusa/nbclass_rb.html">
67
- lib/ankusa/nbclass.rb
66
+ <a href="../files/lib/ankusa/hbase_storage_rb.html">
67
+ lib/ankusa/hbase_storage.rb
68
+ </a>
69
+ <br />
70
+ <a href="../files/lib/ankusa/memory_storage_rb.html">
71
+ lib/ankusa/memory_storage.rb
68
72
  </a>
69
73
  <br />
70
74
  <a href="../files/lib/ankusa/stopwords_rb.html">
@@ -100,7 +104,8 @@
100
104
  <h3 class="section-bar">Classes and Modules</h3>
101
105
 
102
106
  Class <a href="Ankusa/Classifier.html" class="link">Ankusa::Classifier</a><br />
103
- Class <a href="Ankusa/NBClass.html" class="link">Ankusa::NBClass</a><br />
107
+ Class <a href="Ankusa/HBaseStorage.html" class="link">Ankusa::HBaseStorage</a><br />
108
+ Class <a href="Ankusa/MemoryStorage.html" class="link">Ankusa::MemoryStorage</a><br />
104
109
  Class <a href="Ankusa/TextHash.html" class="link">Ankusa::TextHash</a><br />
105
110
 
106
111
  </div>
@@ -110,11 +115,6 @@ Class <a href="Ankusa/TextHash.html" class="link">Ankusa::TextHash</a><br />
110
115
 
111
116
  <div class="name-list">
112
117
  <table summary="Constants">
113
- <tr class="top-aligned-row context-row">
114
- <td class="context-item-name">SMALL_PROB</td>
115
- <td>=</td>
116
- <td class="context-item-value">0.0001</td>
117
- </tr>
118
118
  <tr class="top-aligned-row context-row">
119
119
  <td class="context-item-name">STOPWORDS</td>
120
120
  <td>=</td>
@@ -86,19 +86,12 @@
86
86
  <h3 class="section-bar">Methods</h3>
87
87
 
88
88
  <div class="name-list">
89
- <a href="#M000005">classifications</a>&nbsp;&nbsp;
90
- <a href="#M000004">classify</a>&nbsp;&nbsp;
91
- <a href="#M000009">doc_count_total</a>&nbsp;&nbsp;
92
- <a href="#M000007">drop_tables</a>&nbsp;&nbsp;
93
- <a href="#M000013">freq_table</a>&nbsp;&nbsp;
94
- <a href="#M000010">get_word_probs</a>&nbsp;&nbsp;
95
- <a href="#M000011">init_tables</a>&nbsp;&nbsp;
96
- <a href="#M000001">new</a>&nbsp;&nbsp;
97
- <a href="#M000006">refresh_classnames</a>&nbsp;&nbsp;
98
- <a href="#M000008">reset</a>&nbsp;&nbsp;
99
- <a href="#M000012">summary_table</a>&nbsp;&nbsp;
100
- <a href="#M000002">train</a>&nbsp;&nbsp;
101
- <a href="#M000003">untrain</a>&nbsp;&nbsp;
89
+ <a href="#M000007">classifications</a>&nbsp;&nbsp;
90
+ <a href="#M000006">classify</a>&nbsp;&nbsp;
91
+ <a href="#M000008">get_word_probs</a>&nbsp;&nbsp;
92
+ <a href="#M000003">new</a>&nbsp;&nbsp;
93
+ <a href="#M000004">train</a>&nbsp;&nbsp;
94
+ <a href="#M000005">untrain</a>&nbsp;&nbsp;
102
95
  </div>
103
96
  </div>
104
97
 
@@ -133,27 +126,25 @@
133
126
  <div id="methods">
134
127
  <h3 class="section-bar">Public Class methods</h3>
135
128
 
136
- <div id="method-M000001" class="method-detail">
137
- <a name="M000001"></a>
129
+ <div id="method-M000003" class="method-detail">
130
+ <a name="M000003"></a>
138
131
 
139
132
  <div class="method-heading">
140
- <a href="#M000001" class="method-signature">
141
- <span class="method-name">new</span><span class="method-args">(hbase_client, frequency_tablename=&quot;ankusa_word_frequencies&quot;, summary_tablename=&quot;ankusa_summary&quot;)</span>
133
+ <a href="#M000003" class="method-signature">
134
+ <span class="method-name">new</span><span class="method-args">(storage)</span>
142
135
  </a>
143
136
  </div>
144
137
 
145
138
  <div class="method-description">
146
139
  <p><a class="source-toggle" href="#"
147
- onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
148
- <div class="method-source-code" id="M000001-source">
140
+ onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
141
+ <div class="method-source-code" id="M000003-source">
149
142
  <pre>
150
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 7</span>
151
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">hbase_client</span>, <span class="ruby-identifier">frequency_tablename</span>=<span class="ruby-value str">&quot;ankusa_word_frequencies&quot;</span>, <span class="ruby-identifier">summary_tablename</span>=<span class="ruby-value str">&quot;ankusa_summary&quot;</span>)
152
- <span class="ruby-ivar">@hbase</span> = <span class="ruby-identifier">hbase_client</span>
153
- <span class="ruby-ivar">@ftablename</span> = <span class="ruby-identifier">frequency_tablename</span>
154
- <span class="ruby-ivar">@stablename</span> = <span class="ruby-identifier">summary_tablename</span>
155
- <span class="ruby-identifier">init_tables</span>
156
- <span class="ruby-ivar">@classnames</span> = <span class="ruby-identifier">refresh_classnames</span>
143
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 6</span>
144
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">storage</span>)
145
+ <span class="ruby-ivar">@storage</span> = <span class="ruby-identifier">storage</span>
146
+ <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">init_tables</span>
147
+ <span class="ruby-ivar">@classnames</span> = <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">classnames</span>
157
148
  <span class="ruby-keyword kw">end</span>
158
149
  </pre>
159
150
  </div>
@@ -162,120 +153,39 @@
162
153
 
163
154
  <h3 class="section-bar">Public Instance methods</h3>
164
155
 
165
- <div id="method-M000005" class="method-detail">
166
- <a name="M000005"></a>
156
+ <div id="method-M000007" class="method-detail">
157
+ <a name="M000007"></a>
167
158
 
168
159
  <div class="method-heading">
169
- <a href="#M000005" class="method-signature">
160
+ <a href="#M000007" class="method-signature">
170
161
  <span class="method-name">classifications</span><span class="method-args">(text)</span>
171
162
  </a>
172
163
  </div>
173
164
 
174
165
  <div class="method-description">
175
166
  <p><a class="source-toggle" href="#"
176
- onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
177
- <div class="method-source-code" id="M000005-source">
167
+ onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
168
+ <div class="method-source-code" id="M000007-source">
178
169
  <pre>
179
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 39</span>
170
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 44</span>
180
171
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>)
181
- <span class="ruby-identifier">classes</span> = {}
182
- <span class="ruby-identifier">result</span> = {}
183
- <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
184
- <span class="ruby-identifier">classes</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">NBClass</span>.<span class="ruby-identifier">new</span> <span class="ruby-identifier">k</span>, <span class="ruby-identifier">summary_table</span>, <span class="ruby-identifier">freq_table</span>
185
- <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-value">0</span>
186
- }
172
+ <span class="ruby-identifier">result</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
187
173
 
188
- <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>,<span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
189
- <span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classes</span>)
190
- <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) }
174
+ <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
175
+ <span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>)
176
+ <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) <span class="ruby-operator">*</span> <span class="ruby-identifier">count</span>) }
191
177
  }
192
-
193
- <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">classes</span>[<span class="ruby-identifier">k</span>].<span class="ruby-identifier">doc_count</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">doc_count_total</span>) }
194
-
195
- <span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span>(<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>]) }
196
- <span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
197
- <span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">klass</span><span class="ruby-operator">|</span>
198
- <span class="ruby-identifier">result</span>[<span class="ruby-identifier">klass</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">klass</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span>
199
- }
200
-
201
- <span class="ruby-identifier">result</span>
202
- <span class="ruby-keyword kw">end</span>
203
- </pre>
204
- </div>
205
- </div>
206
- </div>
207
-
208
- <div id="method-M000004" class="method-detail">
209
- <a name="M000004"></a>
210
-
211
- <div class="method-heading">
212
- <a href="#M000004" class="method-signature">
213
- <span class="method-name">classify</span><span class="method-args">(text)</span>
214
- </a>
215
- </div>
216
-
217
- <div class="method-description">
218
- <p><a class="source-toggle" href="#"
219
- onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
220
- <div class="method-source-code" id="M000004-source">
221
- <pre>
222
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 34</span>
223
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>)
224
- <span class="ruby-comment cmt"># return the most probable class</span>
225
- <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">sort</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">o</span>,<span class="ruby-identifier">t</span><span class="ruby-operator">|</span> <span class="ruby-identifier">o</span>[<span class="ruby-value">1</span>] <span class="ruby-operator">&lt;=&gt;</span> <span class="ruby-identifier">t</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
226
- <span class="ruby-keyword kw">end</span>
227
- </pre>
228
- </div>
229
- </div>
230
- </div>
231
-
232
- <div id="method-M000009" class="method-detail">
233
- <a name="M000009"></a>
234
178
 
235
- <div class="method-heading">
236
- <a href="#M000009" class="method-signature">
237
- <span class="method-name">doc_count_total</span><span class="method-args">()</span>
238
- </a>
239
- </div>
240
-
241
- <div class="method-description">
242
- <p><a class="source-toggle" href="#"
243
- onclick="toggleCode('M000009-source');return false;">[Source]</a></p>
244
- <div class="method-source-code" id="M000009-source">
245
- <pre>
246
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 84</span>
247
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">doc_count_total</span>
248
- <span class="ruby-identifier">total</span> = <span class="ruby-value">0</span>
249
- <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">create_scanner</span>(<span class="ruby-value str">&quot;&quot;</span>, <span class="ruby-value str">&quot;totals:doccount&quot;</span>) { <span class="ruby-operator">|</span><span class="ruby-identifier">row</span><span class="ruby-operator">|</span>
250
- <span class="ruby-identifier">total</span> <span class="ruby-operator">+=</span> <span class="ruby-identifier">row</span>.<span class="ruby-identifier">columns</span>[<span class="ruby-value str">&quot;totals:doccount&quot;</span>].<span class="ruby-identifier">to_i64</span>
179
+ <span class="ruby-comment cmt"># add the prior and exponentiate</span>
180
+ <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
181
+ <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">doc_count_total</span>.<span class="ruby-identifier">to_f</span>)
182
+ <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span>(<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>])
251
183
  }
252
- <span class="ruby-identifier">total</span>
253
- <span class="ruby-keyword kw">end</span>
254
- </pre>
255
- </div>
256
- </div>
257
- </div>
258
-
259
- <div id="method-M000007" class="method-detail">
260
- <a name="M000007"></a>
261
-
262
- <div class="method-heading">
263
- <a href="#M000007" class="method-signature">
264
- <span class="method-name">drop_tables</span><span class="method-args">()</span>
265
- </a>
266
- </div>
267
184
 
268
- <div class="method-description">
269
- <p><a class="source-toggle" href="#"
270
- onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
271
- <div class="method-source-code" id="M000007-source">
272
- <pre>
273
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 72</span>
274
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">drop_tables</span>
275
- <span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">delete</span>
276
- <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">delete</span>
277
- <span class="ruby-ivar">@stable</span> = <span class="ruby-keyword kw">nil</span>
278
- <span class="ruby-ivar">@ftable</span> = <span class="ruby-keyword kw">nil</span>
185
+ <span class="ruby-comment cmt"># normalize to get probs</span>
186
+ <span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
187
+ <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span> }
188
+ <span class="ruby-identifier">result</span>
279
189
  <span class="ruby-keyword kw">end</span>
280
190
  </pre>
281
191
  </div>
@@ -287,77 +197,52 @@
287
197
 
288
198
  <div class="method-heading">
289
199
  <a href="#M000006" class="method-signature">
290
- <span class="method-name">refresh_classnames</span><span class="method-args">()</span>
200
+ <span class="method-name">classify</span><span class="method-args">(text)</span>
291
201
  </a>
292
202
  </div>
293
203
 
294
204
  <div class="method-description">
295
- <p>
296
- get all classes
297
- </p>
298
205
  <p><a class="source-toggle" href="#"
299
206
  onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
300
207
  <div class="method-source-code" id="M000006-source">
301
208
  <pre>
302
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 64</span>
303
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">refresh_classnames</span>
304
- <span class="ruby-identifier">cs</span> = []
305
- <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">create_scanner</span>(<span class="ruby-value str">&quot;&quot;</span>, <span class="ruby-value str">&quot;totals&quot;</span>) { <span class="ruby-operator">|</span><span class="ruby-identifier">row</span><span class="ruby-operator">|</span>
306
- <span class="ruby-identifier">cs</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">row</span>.<span class="ruby-identifier">row</span>.<span class="ruby-identifier">intern</span>
307
- }
308
- <span class="ruby-identifier">cs</span>
309
- <span class="ruby-keyword kw">end</span>
310
- </pre>
311
- </div>
312
- </div>
313
- </div>
314
-
315
- <div id="method-M000008" class="method-detail">
316
- <a name="M000008"></a>
317
-
318
- <div class="method-heading">
319
- <a href="#M000008" class="method-signature">
320
- <span class="method-name">reset</span><span class="method-args">()</span>
321
- </a>
322
- </div>
323
-
324
- <div class="method-description">
325
- <p><a class="source-toggle" href="#"
326
- onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
327
- <div class="method-source-code" id="M000008-source">
328
- <pre>
329
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 79</span>
330
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">reset</span>
331
- <span class="ruby-identifier">drop_tables</span>
332
- <span class="ruby-identifier">init_tables</span>
209
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 39</span>
210
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>)
211
+ <span class="ruby-comment cmt"># return the most probable class</span>
212
+ <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">sort_by</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">c</span><span class="ruby-operator">|</span> <span class="ruby-operator">-</span><span class="ruby-identifier">c</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
333
213
  <span class="ruby-keyword kw">end</span>
334
214
  </pre>
335
215
  </div>
336
216
  </div>
337
217
  </div>
338
218
 
339
- <div id="method-M000002" class="method-detail">
340
- <a name="M000002"></a>
219
+ <div id="method-M000004" class="method-detail">
220
+ <a name="M000004"></a>
341
221
 
342
222
  <div class="method-heading">
343
- <a href="#M000002" class="method-signature">
344
- <span class="method-name">train</span><span class="method-args">(klass, text)</span>
223
+ <a href="#M000004" class="method-signature">
224
+ <span class="method-name">train</span><span class="method-args">(klass, text) {|word, count if block_given?| ...}</span>
345
225
  </a>
346
226
  </div>
347
227
 
348
228
  <div class="method-description">
229
+ <p>
230
+ text can be either an array of strings or a string klass is a symbol
231
+ </p>
349
232
  <p><a class="source-toggle" href="#"
350
- onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
351
- <div class="method-source-code" id="M000002-source">
233
+ onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
234
+ <div class="method-source-code" id="M000004-source">
352
235
  <pre>
353
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 15</span>
236
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 14</span>
354
237
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">train</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">text</span>)
355
238
  <span class="ruby-identifier">th</span> = <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>)
356
239
  <span class="ruby-identifier">th</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
357
- <span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">word</span>, <span class="ruby-node">&quot;classes:#{klass.to_s}&quot;</span>, <span class="ruby-identifier">count</span>
240
+ <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span>
241
+ <span class="ruby-keyword kw">yield</span> <span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">block_given?</span>
358
242
  }
359
- <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">&quot;totals:wordcount&quot;</span>, <span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
360
- <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">&quot;totals:doccount&quot;</span>
243
+ <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_total_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
244
+ <span class="ruby-identifier">doccount</span> = (<span class="ruby-identifier">text</span>.<span class="ruby-identifier">kind_of?</span> <span class="ruby-constant">Array</span>) <span class="ruby-operator">?</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">:</span> <span class="ruby-value">1</span>
245
+ <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_doc_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">doccount</span>
361
246
  <span class="ruby-ivar">@classnames</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">klass</span> <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">klass</span>
362
247
  <span class="ruby-keyword kw">end</span>
363
248
  </pre>
@@ -365,28 +250,33 @@ get all classes
365
250
  </div>
366
251
  </div>
367
252
 
368
- <div id="method-M000003" class="method-detail">
369
- <a name="M000003"></a>
253
+ <div id="method-M000005" class="method-detail">
254
+ <a name="M000005"></a>
370
255
 
371
256
  <div class="method-heading">
372
- <a href="#M000003" class="method-signature">
373
- <span class="method-name">untrain</span><span class="method-args">(klass, text)</span>
257
+ <a href="#M000005" class="method-signature">
258
+ <span class="method-name">untrain</span><span class="method-args">(klass, text) {|word, count if block_given?| ...}</span>
374
259
  </a>
375
260
  </div>
376
261
 
377
262
  <div class="method-description">
263
+ <p>
264
+ text can be either an array of strings or a string klass is a symbol
265
+ </p>
378
266
  <p><a class="source-toggle" href="#"
379
- onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
380
- <div class="method-source-code" id="M000003-source">
267
+ onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
268
+ <div class="method-source-code" id="M000005-source">
381
269
  <pre>
382
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 25</span>
270
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 28</span>
383
271
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">untrain</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">text</span>)
384
272
  <span class="ruby-identifier">th</span> = <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>)
385
273
  <span class="ruby-identifier">th</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
386
- <span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">word</span>, <span class="ruby-node">&quot;classes:#{klass.to_s}&quot;</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">count</span>
274
+ <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">word</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">count</span>
275
+ <span class="ruby-keyword kw">yield</span> <span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">block_given?</span>
387
276
  }
388
- <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">&quot;totals:wordcount&quot;</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
389
- <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">&quot;totals:doccount&quot;</span>, <span class="ruby-value">-1</span>
277
+ <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_total_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
278
+ <span class="ruby-identifier">doccount</span> = (<span class="ruby-identifier">text</span>.<span class="ruby-identifier">kind_of?</span> <span class="ruby-constant">Array</span>) <span class="ruby-operator">?</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">:</span> <span class="ruby-value">1</span>
279
+ <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_doc_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">doccount</span>
390
280
  <span class="ruby-keyword kw">end</span>
391
281
  </pre>
392
282
  </div>
@@ -395,53 +285,26 @@ get all classes
395
285
 
396
286
  <h3 class="section-bar">Protected Instance methods</h3>
397
287
 
398
- <div id="method-M000013" class="method-detail">
399
- <a name="M000013"></a>
400
-
401
- <div class="method-heading">
402
- <a href="#M000013" class="method-signature">
403
- <span class="method-name">freq_table</span><span class="method-args">()</span>
404
- </a>
405
- </div>
406
-
407
- <div class="method-description">
408
- <p><a class="source-toggle" href="#"
409
- onclick="toggleCode('M000013-source');return false;">[Source]</a></p>
410
- <div class="method-source-code" id="M000013-source">
411
- <pre>
412
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 120</span>
413
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">freq_table</span>
414
- <span class="ruby-ivar">@ftable</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">get_table</span> <span class="ruby-ivar">@ftablename</span>
415
- <span class="ruby-keyword kw">end</span>
416
- </pre>
417
- </div>
418
- </div>
419
- </div>
420
-
421
- <div id="method-M000010" class="method-detail">
422
- <a name="M000010"></a>
288
+ <div id="method-M000008" class="method-detail">
289
+ <a name="M000008"></a>
423
290
 
424
291
  <div class="method-heading">
425
- <a href="#M000010" class="method-signature">
426
- <span class="method-name">get_word_probs</span><span class="method-args">(word, classes)</span>
292
+ <a href="#M000008" class="method-signature">
293
+ <span class="method-name">get_word_probs</span><span class="method-args">(word)</span>
427
294
  </a>
428
295
  </div>
429
296
 
430
297
  <div class="method-description">
431
298
  <p><a class="source-toggle" href="#"
432
- onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
433
- <div class="method-source-code" id="M000010-source">
299
+ onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
300
+ <div class="method-source-code" id="M000008-source">
434
301
  <pre>
435
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 93</span>
436
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classes</span>)
437
- <span class="ruby-identifier">probs</span> = {}
438
- <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cn</span><span class="ruby-operator">|</span> <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] = <span class="ruby-constant">Ankusa</span><span class="ruby-operator">::</span><span class="ruby-constant">SMALL_PROB</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">classes</span>[<span class="ruby-identifier">cn</span>].<span class="ruby-identifier">word_count</span> }
439
- <span class="ruby-identifier">row</span> = <span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">get_row</span>(<span class="ruby-identifier">word</span>)
440
- <span class="ruby-keyword kw">return</span> <span class="ruby-identifier">probs</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">row</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">==</span> <span class="ruby-value">0</span>
441
-
442
- <span class="ruby-identifier">row</span>.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">columns</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">colname</span>, <span class="ruby-identifier">cell</span><span class="ruby-operator">|</span>
443
- <span class="ruby-identifier">classname</span> = <span class="ruby-identifier">colname</span>.<span class="ruby-identifier">split</span>(<span class="ruby-value str">':'</span>)[<span class="ruby-value">1</span>].<span class="ruby-identifier">intern</span>
444
- <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">classname</span>] = <span class="ruby-identifier">cell</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">classes</span>[<span class="ruby-identifier">classname</span>].<span class="ruby-identifier">word_count</span>
302
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 65</span>
303
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>)
304
+ <span class="ruby-identifier">probs</span> = <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_word_counts</span>(<span class="ruby-identifier">word</span>)
305
+ <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cn</span><span class="ruby-operator">|</span>
306
+ <span class="ruby-comment cmt"># use a laplacian smoother</span>
307
+ <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] = (<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> (<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_total_word_count</span>(<span class="ruby-identifier">cn</span>) <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span>
445
308
  }
446
309
  <span class="ruby-identifier">probs</span>
447
310
  <span class="ruby-keyword kw">end</span>
@@ -450,58 +313,6 @@ get all classes
450
313
  </div>
451
314
  </div>
452
315
 
453
- <div id="method-M000011" class="method-detail">
454
- <a name="M000011"></a>
455
-
456
- <div class="method-heading">
457
- <a href="#M000011" class="method-signature">
458
- <span class="method-name">init_tables</span><span class="method-args">()</span>
459
- </a>
460
- </div>
461
-
462
- <div class="method-description">
463
- <p><a class="source-toggle" href="#"
464
- onclick="toggleCode('M000011-source');return false;">[Source]</a></p>
465
- <div class="method-source-code" id="M000011-source">
466
- <pre>
467
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 106</span>
468
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">init_tables</span>
469
- <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">has_table?</span> <span class="ruby-ivar">@ftablename</span>
470
- <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">create_table</span> <span class="ruby-ivar">@ftablename</span>, <span class="ruby-value str">&quot;classes&quot;</span>, <span class="ruby-value str">&quot;total&quot;</span>
471
- <span class="ruby-keyword kw">end</span>
472
-
473
- <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">has_table?</span> <span class="ruby-ivar">@stablename</span>
474
- <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">create_table</span> <span class="ruby-ivar">@stablename</span>, <span class="ruby-value str">&quot;totals&quot;</span>
475
- <span class="ruby-keyword kw">end</span>
476
- <span class="ruby-keyword kw">end</span>
477
- </pre>
478
- </div>
479
- </div>
480
- </div>
481
-
482
- <div id="method-M000012" class="method-detail">
483
- <a name="M000012"></a>
484
-
485
- <div class="method-heading">
486
- <a href="#M000012" class="method-signature">
487
- <span class="method-name">summary_table</span><span class="method-args">()</span>
488
- </a>
489
- </div>
490
-
491
- <div class="method-description">
492
- <p><a class="source-toggle" href="#"
493
- onclick="toggleCode('M000012-source');return false;">[Source]</a></p>
494
- <div class="method-source-code" id="M000012-source">
495
- <pre>
496
- <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 116</span>
497
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">summary_table</span>
498
- <span class="ruby-ivar">@stable</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">get_table</span> <span class="ruby-ivar">@stablename</span>
499
- <span class="ruby-keyword kw">end</span>
500
- </pre>
501
- </div>
502
- </div>
503
- </div>
504
-
505
316
 
506
317
  </div>
507
318