ankusa 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +10 -1
- data/Rakefile +4 -4
- data/docs/classes/Ankusa/Classifier.html +125 -32
- data/docs/classes/Ankusa/HBaseStorage.html +165 -108
- data/docs/classes/Ankusa/MemoryStorage.html +117 -89
- data/docs/classes/Ankusa/TextHash.html +30 -30
- data/docs/created.rid +1 -1
- data/docs/files/README_rdoc.html +16 -3
- data/docs/files/lib/ankusa/classifier_rb.html +1 -1
- data/docs/files/lib/ankusa/hbase_storage_rb.html +1 -1
- data/docs/files/lib/ankusa/memory_storage_rb.html +1 -1
- data/docs/fr_method_index.html +40 -34
- data/lib/ankusa/classifier.rb +44 -15
- data/lib/ankusa/hbase_storage.rb +25 -8
- data/lib/ankusa/memory_storage.rb +10 -2
- metadata +6 -6
data/README.rdoc
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
= ankusa
|
2
2
|
|
3
|
-
Ankusa is a
|
3
|
+
Ankusa is a text classifier in Ruby that uses Hadoop's HBase for storage. Because it uses HBase as a backend, the training corpus can be many terabytes in size.
|
4
|
+
|
5
|
+
Ankusa currently uses a Naive Bayes classifier. It ignores common words (a.k.a. stop words) and stems all others. Additionally, it uses Laplacian smoothing in the classification method.
|
4
6
|
|
5
7
|
== Installation
|
6
8
|
First, install HBase / Hadoop. Make sure the HBase Thrift interface has been started as well. Then:
|
@@ -15,6 +17,8 @@ First, install HBase / Hadoop. Make sure the HBase Thrift interface has been st
|
|
15
17
|
storage = Ankusa::HBaseStorage.new 'localhost'
|
16
18
|
c = Ankusa::Classifier.new storage
|
17
19
|
|
20
|
+
# Each of these calls will return a bag-of-words
|
21
|
+
# hash with stemmed words as keys and counts as values
|
18
22
|
c.train :spam, "This is some spammy text"
|
19
23
|
c.train :good, "This is not the bad stuff"
|
20
24
|
|
@@ -25,6 +29,11 @@ First, install HBase / Hadoop. Make sure the HBase Thrift interface has been st
|
|
25
29
|
# membership probability as values
|
26
30
|
puts c.classifications "This is some spammy text"
|
27
31
|
|
32
|
+
# If you have a large corpus, the probabilities will
|
33
|
+
# likely all be 0. In that case, you must use log
|
34
|
+
# likelihood values
|
35
|
+
puts c.log_likelihoods "This is some spammy text"
|
36
|
+
|
28
37
|
# get a list of all classes
|
29
38
|
puts c.classes
|
30
39
|
|
data/Rakefile
CHANGED
@@ -22,11 +22,11 @@ Rake::TestTask.new("test") { |t|
|
|
22
22
|
|
23
23
|
spec = Gem::Specification.new do |s|
|
24
24
|
s.name = "ankusa"
|
25
|
-
s.version = "0.0.
|
25
|
+
s.version = "0.0.6"
|
26
26
|
s.authors = ["Brian Muller"]
|
27
|
-
s.date = %q{2010-12-
|
28
|
-
s.description = "
|
29
|
-
s.summary = "
|
27
|
+
s.date = %q{2010-12-06}
|
28
|
+
s.description = "Text classifier with HBase storage"
|
29
|
+
s.summary = "Text classifier in Ruby that uses Hadoop's HBase for storage"
|
30
30
|
s.email = "brian.muller@livingsocial.com"
|
31
31
|
s.files = FileList["lib/**/*", "[A-Z]*", "Rakefile", "docs/**/*"]
|
32
32
|
s.homepage = "https://github.com/livingsocial/ankusa"
|
@@ -88,10 +88,13 @@
|
|
88
88
|
<div class="name-list">
|
89
89
|
<a href="#M000007">classifications</a>
|
90
90
|
<a href="#M000006">classify</a>
|
91
|
-
<a href="#
|
91
|
+
<a href="#M000010">doc_count_totals</a>
|
92
|
+
<a href="#M000009">get_word_probs</a>
|
93
|
+
<a href="#M000008">log_likelihoods</a>
|
92
94
|
<a href="#M000003">new</a>
|
93
95
|
<a href="#M000004">train</a>
|
94
96
|
<a href="#M000005">untrain</a>
|
97
|
+
<a href="#M000011">vocab_sizes</a>
|
95
98
|
</div>
|
96
99
|
</div>
|
97
100
|
|
@@ -158,33 +161,28 @@
|
|
158
161
|
|
159
162
|
<div class="method-heading">
|
160
163
|
<a href="#M000007" class="method-signature">
|
161
|
-
<span class="method-name">classifications</span><span class="method-args">(text)</span>
|
164
|
+
<span class="method-name">classifications</span><span class="method-args">(text, classnames=nil)</span>
|
162
165
|
</a>
|
163
166
|
</div>
|
164
167
|
|
165
168
|
<div class="method-description">
|
169
|
+
<p>
|
170
|
+
classnames is an optional array of classes to look at (when nil, all known classes are used)
|
171
|
+
</p>
|
166
172
|
<p><a class="source-toggle" href="#"
|
167
173
|
onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
|
168
174
|
<div class="method-source-code" id="M000007-source">
|
169
175
|
<pre>
|
170
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
171
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>)
|
172
|
-
<span class="ruby-identifier">result</span> = <span class="ruby-
|
173
|
-
|
174
|
-
|
175
|
-
<span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>)
|
176
|
-
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) <span class="ruby-operator">*</span> <span class="ruby-identifier">count</span>) }
|
176
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 53</span>
|
177
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword kw">nil</span>)
|
178
|
+
<span class="ruby-identifier">result</span> = <span class="ruby-identifier">log_likelihoods</span> <span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>
|
179
|
+
<span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
|
180
|
+
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>]
|
177
181
|
}
|
178
182
|
|
179
|
-
<span class="ruby-comment cmt"># add the prior and exponentiate</span>
|
180
|
-
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
|
181
|
-
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">doc_count_total</span>.<span class="ruby-identifier">to_f</span>)
|
182
|
-
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span>(<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>])
|
183
|
-
}
|
184
|
-
|
185
183
|
<span class="ruby-comment cmt"># normalize to get probs</span>
|
186
184
|
<span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
|
187
|
-
<span class="ruby-
|
185
|
+
<span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span> }
|
188
186
|
<span class="ruby-identifier">result</span>
|
189
187
|
<span class="ruby-keyword kw">end</span>
|
190
188
|
</pre>
|
@@ -197,7 +195,7 @@
|
|
197
195
|
|
198
196
|
<div class="method-heading">
|
199
197
|
<a href="#M000006" class="method-signature">
|
200
|
-
<span class="method-name">classify</span><span class="method-args">(text)</span>
|
198
|
+
<span class="method-name">classify</span><span class="method-args">(text, classes=nil)</span>
|
201
199
|
</a>
|
202
200
|
</div>
|
203
201
|
|
@@ -206,10 +204,51 @@
|
|
206
204
|
onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
|
207
205
|
<div class="method-source-code" id="M000006-source">
|
208
206
|
<pre>
|
209
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
210
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>)
|
207
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 47</span>
|
208
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>=<span class="ruby-keyword kw">nil</span>)
|
211
209
|
<span class="ruby-comment cmt"># return the most probable class</span>
|
212
|
-
<span class="ruby-identifier">
|
210
|
+
<span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>).<span class="ruby-identifier">sort_by</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">c</span><span class="ruby-operator">|</span> <span class="ruby-operator">-</span><span class="ruby-identifier">c</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
|
211
|
+
<span class="ruby-keyword kw">end</span>
|
212
|
+
</pre>
|
213
|
+
</div>
|
214
|
+
</div>
|
215
|
+
</div>
|
216
|
+
|
217
|
+
<div id="method-M000008" class="method-detail">
|
218
|
+
<a name="M000008"></a>
|
219
|
+
|
220
|
+
<div class="method-heading">
|
221
|
+
<a href="#M000008" class="method-signature">
|
222
|
+
<span class="method-name">log_likelihoods</span><span class="method-args">(text, classnames=nil)</span>
|
223
|
+
</a>
|
224
|
+
</div>
|
225
|
+
|
226
|
+
<div class="method-description">
|
227
|
+
<p>
|
228
|
+
classnames is an optional array of classes to look at (when nil, all known classes are used)
|
229
|
+
</p>
|
230
|
+
<p><a class="source-toggle" href="#"
|
231
|
+
onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
|
232
|
+
<div class="method-source-code" id="M000008-source">
|
233
|
+
<pre>
|
234
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 66</span>
|
235
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword kw">nil</span>)
|
236
|
+
<span class="ruby-identifier">classnames</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@classnames</span>
|
237
|
+
<span class="ruby-identifier">result</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
|
238
|
+
|
239
|
+
<span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
|
240
|
+
<span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classnames</span>)
|
241
|
+
<span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) <span class="ruby-operator">*</span> <span class="ruby-identifier">count</span>) }
|
242
|
+
}
|
243
|
+
|
244
|
+
<span class="ruby-comment cmt"># add the smoothed log prior for each class</span>
|
245
|
+
<span class="ruby-identifier">doc_counts</span> = <span class="ruby-identifier">doc_count_totals</span>.<span class="ruby-identifier">select</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">k</span> }.<span class="ruby-identifier">map</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">v</span> }
|
246
|
+
<span class="ruby-identifier">doc_count_total</span> = (<span class="ruby-identifier">doc_counts</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> } <span class="ruby-operator">+</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">length</span>).<span class="ruby-identifier">to_f</span>
|
247
|
+
<span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
|
248
|
+
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>((<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>) <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">doc_count_total</span>)
|
249
|
+
}
|
250
|
+
|
251
|
+
<span class="ruby-identifier">result</span>
|
213
252
|
<span class="ruby-keyword kw">end</span>
|
214
253
|
</pre>
|
215
254
|
</div>
|
@@ -244,6 +283,9 @@ text can be either an array of strings or a string klass is a symbol
|
|
244
283
|
<span class="ruby-identifier">doccount</span> = (<span class="ruby-identifier">text</span>.<span class="ruby-identifier">kind_of?</span> <span class="ruby-constant">Array</span>) <span class="ruby-operator">?</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">:</span> <span class="ruby-value">1</span>
|
245
284
|
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_doc_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">doccount</span>
|
246
285
|
<span class="ruby-ivar">@classnames</span> <span class="ruby-operator"><<</span> <span class="ruby-identifier">klass</span> <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">klass</span>
|
286
|
+
<span class="ruby-comment cmt"># cache is now dirty of these vars</span>
|
287
|
+
<span class="ruby-ivar">@doc_count_totals</span> = <span class="ruby-keyword kw">nil</span>
|
288
|
+
<span class="ruby-ivar">@vocab_sizes</span> = <span class="ruby-keyword kw">nil</span>
|
247
289
|
<span class="ruby-identifier">th</span>
|
248
290
|
<span class="ruby-keyword kw">end</span>
|
249
291
|
</pre>
|
@@ -268,7 +310,7 @@ text can be either an array of strings or a string klass is a symbol
|
|
268
310
|
onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
|
269
311
|
<div class="method-source-code" id="M000005-source">
|
270
312
|
<pre>
|
271
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
313
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 32</span>
|
272
314
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">untrain</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">text</span>)
|
273
315
|
<span class="ruby-identifier">th</span> = <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>)
|
274
316
|
<span class="ruby-identifier">th</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
|
@@ -278,6 +320,9 @@ text can be either an array of strings or a string klass is a symbol
|
|
278
320
|
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_total_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
|
279
321
|
<span class="ruby-identifier">doccount</span> = (<span class="ruby-identifier">text</span>.<span class="ruby-identifier">kind_of?</span> <span class="ruby-constant">Array</span>) <span class="ruby-operator">?</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">:</span> <span class="ruby-value">1</span>
|
280
322
|
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_doc_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">doccount</span>
|
323
|
+
<span class="ruby-comment cmt"># cache is now dirty of these vars</span>
|
324
|
+
<span class="ruby-ivar">@doc_count_totals</span> = <span class="ruby-keyword kw">nil</span>
|
325
|
+
<span class="ruby-ivar">@vocab_sizes</span> = <span class="ruby-keyword kw">nil</span>
|
281
326
|
<span class="ruby-identifier">th</span>
|
282
327
|
<span class="ruby-keyword kw">end</span>
|
283
328
|
</pre>
|
@@ -287,26 +332,51 @@ text can be either an array of strings or a string klass is a symbol
|
|
287
332
|
|
288
333
|
<h3 class="section-bar">Protected Instance methods</h3>
|
289
334
|
|
290
|
-
<div id="method-
|
291
|
-
<a name="
|
335
|
+
<div id="method-M000010" class="method-detail">
|
336
|
+
<a name="M000010"></a>
|
292
337
|
|
293
338
|
<div class="method-heading">
|
294
|
-
<a href="#
|
295
|
-
<span class="method-name">
|
339
|
+
<a href="#M000010" class="method-signature">
|
340
|
+
<span class="method-name">doc_count_totals</span><span class="method-args">()</span>
|
296
341
|
</a>
|
297
342
|
</div>
|
298
343
|
|
299
344
|
<div class="method-description">
|
300
345
|
<p><a class="source-toggle" href="#"
|
301
|
-
onclick="toggleCode('
|
302
|
-
<div class="method-source-code" id="
|
346
|
+
onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
|
347
|
+
<div class="method-source-code" id="M000010-source">
|
303
348
|
<pre>
|
304
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
305
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">
|
306
|
-
<span class="ruby-
|
307
|
-
|
349
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 97</span>
|
350
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">doc_count_totals</span>
|
351
|
+
<span class="ruby-ivar">@doc_count_totals</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">doc_count_totals</span>
|
352
|
+
<span class="ruby-keyword kw">end</span>
|
353
|
+
</pre>
|
354
|
+
</div>
|
355
|
+
</div>
|
356
|
+
</div>
|
357
|
+
|
358
|
+
<div id="method-M000009" class="method-detail">
|
359
|
+
<a name="M000009"></a>
|
360
|
+
|
361
|
+
<div class="method-heading">
|
362
|
+
<a href="#M000009" class="method-signature">
|
363
|
+
<span class="method-name">get_word_probs</span><span class="method-args">(word, classnames)</span>
|
364
|
+
</a>
|
365
|
+
</div>
|
366
|
+
|
367
|
+
<div class="method-description">
|
368
|
+
<p><a class="source-toggle" href="#"
|
369
|
+
onclick="toggleCode('M000009-source');return false;">[Source]</a></p>
|
370
|
+
<div class="method-source-code" id="M000009-source">
|
371
|
+
<pre>
|
372
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 86</span>
|
373
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classnames</span>)
|
374
|
+
<span class="ruby-identifier">probs</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
|
375
|
+
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_word_counts</span>(<span class="ruby-identifier">word</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">v</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">k</span> }
|
376
|
+
<span class="ruby-identifier">vs</span> = <span class="ruby-identifier">vocab_sizes</span>
|
377
|
+
<span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cn</span><span class="ruby-operator">|</span>
|
308
378
|
<span class="ruby-comment cmt"># use a laplacian smoother</span>
|
309
|
-
<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] = (<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> (<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_total_word_count</span>(<span class="ruby-identifier">cn</span>) <span class="ruby-operator">+</span> <span class="ruby-
|
379
|
+
<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] = (<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> (<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_total_word_count</span>(<span class="ruby-identifier">cn</span>) <span class="ruby-operator">+</span> <span class="ruby-identifier">vs</span>[<span class="ruby-identifier">cn</span>]).<span class="ruby-identifier">to_f</span>
|
310
380
|
}
|
311
381
|
<span class="ruby-identifier">probs</span>
|
312
382
|
<span class="ruby-keyword kw">end</span>
|
@@ -315,6 +385,29 @@ text can be either an array of strings or a string klass is a symbol
|
|
315
385
|
</div>
|
316
386
|
</div>
|
317
387
|
|
388
|
+
<div id="method-M000011" class="method-detail">
|
389
|
+
<a name="M000011"></a>
|
390
|
+
|
391
|
+
<div class="method-heading">
|
392
|
+
<a href="#M000011" class="method-signature">
|
393
|
+
<span class="method-name">vocab_sizes</span><span class="method-args">()</span>
|
394
|
+
</a>
|
395
|
+
</div>
|
396
|
+
|
397
|
+
<div class="method-description">
|
398
|
+
<p><a class="source-toggle" href="#"
|
399
|
+
onclick="toggleCode('M000011-source');return false;">[Source]</a></p>
|
400
|
+
<div class="method-source-code" id="M000011-source">
|
401
|
+
<pre>
|
402
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 101</span>
|
403
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">vocab_sizes</span>
|
404
|
+
<span class="ruby-ivar">@vocab_sizes</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_vocabulary_sizes</span>
|
405
|
+
<span class="ruby-keyword kw">end</span>
|
406
|
+
</pre>
|
407
|
+
</div>
|
408
|
+
</div>
|
409
|
+
</div>
|
410
|
+
|
318
411
|
|
319
412
|
</div>
|
320
413
|
|
@@ -86,21 +86,23 @@
|
|
86
86
|
<h3 class="section-bar">Methods</h3>
|
87
87
|
|
88
88
|
<div class="name-list">
|
89
|
-
<a href="#
|
90
|
-
<a href="#
|
91
|
-
<a href="#
|
92
|
-
<a href="#
|
93
|
-
<a href="#
|
94
|
-
<a href="#
|
95
|
-
<a href="#
|
96
|
-
<a href="#
|
97
|
-
<a href="#M000032">
|
98
|
-
<a href="#M000031">
|
99
|
-
<a href="#
|
100
|
-
<a href="#
|
101
|
-
<a href="#
|
102
|
-
<a href="#
|
103
|
-
<a href="#
|
89
|
+
<a href="#M000027">classnames</a>
|
90
|
+
<a href="#M000039">close</a>
|
91
|
+
<a href="#M000038">doc_count_totals</a>
|
92
|
+
<a href="#M000029">drop_tables</a>
|
93
|
+
<a href="#M000042">freq_table</a>
|
94
|
+
<a href="#M000034">get_doc_count</a>
|
95
|
+
<a href="#M000040">get_summary</a>
|
96
|
+
<a href="#M000033">get_total_word_count</a>
|
97
|
+
<a href="#M000032">get_vocabulary_sizes</a>
|
98
|
+
<a href="#M000031">get_word_counts</a>
|
99
|
+
<a href="#M000037">incr_doc_count</a>
|
100
|
+
<a href="#M000036">incr_total_word_count</a>
|
101
|
+
<a href="#M000035">incr_word_count</a>
|
102
|
+
<a href="#M000030">init_tables</a>
|
103
|
+
<a href="#M000026">new</a>
|
104
|
+
<a href="#M000028">reset</a>
|
105
|
+
<a href="#M000041">summary_table</a>
|
104
106
|
</div>
|
105
107
|
</div>
|
106
108
|
|
@@ -135,19 +137,19 @@
|
|
135
137
|
<div id="methods">
|
136
138
|
<h3 class="section-bar">Public Class methods</h3>
|
137
139
|
|
138
|
-
<div id="method-
|
139
|
-
<a name="
|
140
|
+
<div id="method-M000026" class="method-detail">
|
141
|
+
<a name="M000026"></a>
|
140
142
|
|
141
143
|
<div class="method-heading">
|
142
|
-
<a href="#
|
144
|
+
<a href="#M000026" class="method-signature">
|
143
145
|
<span class="method-name">new</span><span class="method-args">(host='localhost', port=9090, frequency_tablename="ankusa_word_frequencies", summary_tablename="ankusa_summary")</span>
|
144
146
|
</a>
|
145
147
|
</div>
|
146
148
|
|
147
149
|
<div class="method-description">
|
148
150
|
<p><a class="source-toggle" href="#"
|
149
|
-
onclick="toggleCode('
|
150
|
-
<div class="method-source-code" id="
|
151
|
+
onclick="toggleCode('M000026-source');return false;">[Source]</a></p>
|
152
|
+
<div class="method-source-code" id="M000026-source">
|
151
153
|
<pre>
|
152
154
|
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 8</span>
|
153
155
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">host</span>=<span class="ruby-value str">'localhost'</span>, <span class="ruby-identifier">port</span>=<span class="ruby-value">9090</span>, <span class="ruby-identifier">frequency_tablename</span>=<span class="ruby-value str">"ankusa_word_frequencies"</span>, <span class="ruby-identifier">summary_tablename</span>=<span class="ruby-value str">"ankusa_summary"</span>)
|
@@ -165,19 +167,19 @@
|
|
165
167
|
|
166
168
|
<h3 class="section-bar">Public Instance methods</h3>
|
167
169
|
|
168
|
-
<div id="method-
|
169
|
-
<a name="
|
170
|
+
<div id="method-M000027" class="method-detail">
|
171
|
+
<a name="M000027"></a>
|
170
172
|
|
171
173
|
<div class="method-heading">
|
172
|
-
<a href="#
|
174
|
+
<a href="#M000027" class="method-signature">
|
173
175
|
<span class="method-name">classnames</span><span class="method-args">()</span>
|
174
176
|
</a>
|
175
177
|
</div>
|
176
178
|
|
177
179
|
<div class="method-description">
|
178
180
|
<p><a class="source-toggle" href="#"
|
179
|
-
onclick="toggleCode('
|
180
|
-
<div class="method-source-code" id="
|
181
|
+
onclick="toggleCode('M000027-source');return false;">[Source]</a></p>
|
182
|
+
<div class="method-source-code" id="M000027-source">
|
181
183
|
<pre>
|
182
184
|
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 17</span>
|
183
185
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classnames</span>
|
@@ -192,21 +194,21 @@
|
|
192
194
|
</div>
|
193
195
|
</div>
|
194
196
|
|
195
|
-
<div id="method-
|
196
|
-
<a name="
|
197
|
+
<div id="method-M000039" class="method-detail">
|
198
|
+
<a name="M000039"></a>
|
197
199
|
|
198
200
|
<div class="method-heading">
|
199
|
-
<a href="#
|
201
|
+
<a href="#M000039" class="method-signature">
|
200
202
|
<span class="method-name">close</span><span class="method-args">()</span>
|
201
203
|
</a>
|
202
204
|
</div>
|
203
205
|
|
204
206
|
<div class="method-description">
|
205
207
|
<p><a class="source-toggle" href="#"
|
206
|
-
onclick="toggleCode('
|
207
|
-
<div class="method-source-code" id="
|
208
|
+
onclick="toggleCode('M000039-source');return false;">[Source]</a></p>
|
209
|
+
<div class="method-source-code" id="M000039-source">
|
208
210
|
<pre>
|
209
|
-
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line
|
211
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 103</span>
|
210
212
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">close</span>
|
211
213
|
<span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">close</span>
|
212
214
|
<span class="ruby-keyword kw">end</span>
|
@@ -215,46 +217,42 @@
|
|
215
217
|
</div>
|
216
218
|
</div>
|
217
219
|
|
218
|
-
<div id="method-
|
219
|
-
<a name="
|
220
|
+
<div id="method-M000038" class="method-detail">
|
221
|
+
<a name="M000038"></a>
|
220
222
|
|
221
223
|
<div class="method-heading">
|
222
|
-
<a href="#
|
223
|
-
<span class="method-name">
|
224
|
+
<a href="#M000038" class="method-signature">
|
225
|
+
<span class="method-name">doc_count_totals</span><span class="method-args">()</span>
|
224
226
|
</a>
|
225
227
|
</div>
|
226
228
|
|
227
229
|
<div class="method-description">
|
228
230
|
<p><a class="source-toggle" href="#"
|
229
|
-
onclick="toggleCode('
|
230
|
-
<div class="method-source-code" id="
|
231
|
+
onclick="toggleCode('M000038-source');return false;">[Source]</a></p>
|
232
|
+
<div class="method-source-code" id="M000038-source">
|
231
233
|
<pre>
|
232
|
-
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line
|
233
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">
|
234
|
-
<span class="ruby-identifier">
|
235
|
-
<span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">create_scanner</span>(<span class="ruby-value str">""</span>, <span class="ruby-value str">"totals:doccount"</span>) { <span class="ruby-operator">|</span><span class="ruby-identifier">row</span><span class="ruby-operator">|</span>
|
236
|
-
<span class="ruby-identifier">total</span> <span class="ruby-operator">+=</span> <span class="ruby-identifier">row</span>.<span class="ruby-identifier">columns</span>[<span class="ruby-value str">"totals:doccount"</span>].<span class="ruby-identifier">to_i64</span>
|
237
|
-
}
|
238
|
-
<span class="ruby-identifier">total</span>
|
234
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 99</span>
|
235
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">doc_count_totals</span>
|
236
|
+
<span class="ruby-identifier">get_summary</span> <span class="ruby-value str">"totals:doccount"</span>
|
239
237
|
<span class="ruby-keyword kw">end</span>
|
240
238
|
</pre>
|
241
239
|
</div>
|
242
240
|
</div>
|
243
241
|
</div>
|
244
242
|
|
245
|
-
<div id="method-
|
246
|
-
<a name="
|
243
|
+
<div id="method-M000029" class="method-detail">
|
244
|
+
<a name="M000029"></a>
|
247
245
|
|
248
246
|
<div class="method-heading">
|
249
|
-
<a href="#
|
247
|
+
<a href="#M000029" class="method-signature">
|
250
248
|
<span class="method-name">drop_tables</span><span class="method-args">()</span>
|
251
249
|
</a>
|
252
250
|
</div>
|
253
251
|
|
254
252
|
<div class="method-description">
|
255
253
|
<p><a class="source-toggle" href="#"
|
256
|
-
onclick="toggleCode('
|
257
|
-
<div class="method-source-code" id="
|
254
|
+
onclick="toggleCode('M000029-source');return false;">[Source]</a></p>
|
255
|
+
<div class="method-source-code" id="M000029-source">
|
258
256
|
<pre>
|
259
257
|
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 30</span>
|
260
258
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">drop_tables</span>
|
@@ -270,21 +268,21 @@
|
|
270
268
|
</div>
|
271
269
|
</div>
|
272
270
|
|
273
|
-
<div id="method-
|
274
|
-
<a name="
|
271
|
+
<div id="method-M000034" class="method-detail">
|
272
|
+
<a name="M000034"></a>
|
275
273
|
|
276
274
|
<div class="method-heading">
|
277
|
-
<a href="#
|
275
|
+
<a href="#M000034" class="method-signature">
|
278
276
|
<span class="method-name">get_doc_count</span><span class="method-args">(klass)</span>
|
279
277
|
</a>
|
280
278
|
</div>
|
281
279
|
|
282
280
|
<div class="method-description">
|
283
281
|
<p><a class="source-toggle" href="#"
|
284
|
-
onclick="toggleCode('
|
285
|
-
<div class="method-source-code" id="
|
282
|
+
onclick="toggleCode('M000034-source');return false;">[Source]</a></p>
|
283
|
+
<div class="method-source-code" id="M000034-source">
|
286
284
|
<pre>
|
287
|
-
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line
|
285
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 73</span>
|
288
286
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">klass</span>)
|
289
287
|
<span class="ruby-ivar">@klass_doc_counts</span>.<span class="ruby-identifier">fetch</span>(<span class="ruby-identifier">klass</span>) {
|
290
288
|
<span class="ruby-ivar">@klass_doc_counts</span>[<span class="ruby-identifier">klass</span>] = <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">get</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-value str">"totals:doccount"</span>).<span class="ruby-identifier">first</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span>
|
@@ -295,21 +293,21 @@
|
|
295
293
|
</div>
|
296
294
|
</div>
|
297
295
|
|
298
|
-
<div id="method-
|
299
|
-
<a name="
|
296
|
+
<div id="method-M000033" class="method-detail">
|
297
|
+
<a name="M000033"></a>
|
300
298
|
|
301
299
|
<div class="method-heading">
|
302
|
-
<a href="#
|
300
|
+
<a href="#M000033" class="method-signature">
|
303
301
|
<span class="method-name">get_total_word_count</span><span class="method-args">(klass)</span>
|
304
302
|
</a>
|
305
303
|
</div>
|
306
304
|
|
307
305
|
<div class="method-description">
|
308
306
|
<p><a class="source-toggle" href="#"
|
309
|
-
onclick="toggleCode('
|
310
|
-
<div class="method-source-code" id="
|
307
|
+
onclick="toggleCode('M000033-source');return false;">[Source]</a></p>
|
308
|
+
<div class="method-source-code" id="M000033-source">
|
311
309
|
<pre>
|
312
|
-
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line
|
310
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 67</span>
|
313
311
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_total_word_count</span>(<span class="ruby-identifier">klass</span>)
|
314
312
|
<span class="ruby-ivar">@klass_word_counts</span>.<span class="ruby-identifier">fetch</span>(<span class="ruby-identifier">klass</span>) {
|
315
313
|
<span class="ruby-ivar">@klass_word_counts</span>[<span class="ruby-identifier">klass</span>] = <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">get</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-value str">"totals:wordcount"</span>).<span class="ruby-identifier">first</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span>
|
@@ -320,19 +318,42 @@
|
|
320
318
|
</div>
|
321
319
|
</div>
|
322
320
|
|
323
|
-
<div id="method-
|
324
|
-
<a name="
|
321
|
+
<div id="method-M000032" class="method-detail">
|
322
|
+
<a name="M000032"></a>
|
325
323
|
|
326
324
|
<div class="method-heading">
|
327
|
-
<a href="#
|
325
|
+
<a href="#M000032" class="method-signature">
|
326
|
+
<span class="method-name">get_vocabulary_sizes</span><span class="method-args">()</span>
|
327
|
+
</a>
|
328
|
+
</div>
|
329
|
+
|
330
|
+
<div class="method-description">
|
331
|
+
<p><a class="source-toggle" href="#"
|
332
|
+
onclick="toggleCode('M000032-source');return false;">[Source]</a></p>
|
333
|
+
<div class="method-source-code" id="M000032-source">
|
334
|
+
<pre>
|
335
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 63</span>
|
336
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_vocabulary_sizes</span>
|
337
|
+
<span class="ruby-identifier">get_summary</span> <span class="ruby-value str">"totals:vocabsize"</span>
|
338
|
+
<span class="ruby-keyword kw">end</span>
|
339
|
+
</pre>
|
340
|
+
</div>
|
341
|
+
</div>
|
342
|
+
</div>
|
343
|
+
|
344
|
+
<div id="method-M000031" class="method-detail">
|
345
|
+
<a name="M000031"></a>
|
346
|
+
|
347
|
+
<div class="method-heading">
|
348
|
+
<a href="#M000031" class="method-signature">
|
328
349
|
<span class="method-name">get_word_counts</span><span class="method-args">(word)</span>
|
329
350
|
</a>
|
330
351
|
</div>
|
331
352
|
|
332
353
|
<div class="method-description">
|
333
354
|
<p><a class="source-toggle" href="#"
|
334
|
-
onclick="toggleCode('
|
335
|
-
<div class="method-source-code" id="
|
355
|
+
onclick="toggleCode('M000031-source');return false;">[Source]</a></p>
|
356
|
+
<div class="method-source-code" id="M000031-source">
|
336
357
|
<pre>
|
337
358
|
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 49</span>
|
338
359
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_counts</span>(<span class="ruby-identifier">word</span>)
|
@@ -342,7 +363,8 @@
|
|
342
363
|
|
343
364
|
<span class="ruby-identifier">row</span>.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">columns</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">colname</span>, <span class="ruby-identifier">cell</span><span class="ruby-operator">|</span>
|
344
365
|
<span class="ruby-identifier">classname</span> = <span class="ruby-identifier">colname</span>.<span class="ruby-identifier">split</span>(<span class="ruby-value str">':'</span>)[<span class="ruby-value">1</span>].<span class="ruby-identifier">intern</span>
|
345
|
-
<span class="ruby-
|
366
|
+
<span class="ruby-comment cmt"># in case untrain has been called too many times</span>
|
367
|
+
<span class="ruby-identifier">counts</span>[<span class="ruby-identifier">classname</span>] = [<span class="ruby-identifier">cell</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span>, <span class="ruby-value">0</span>].<span class="ruby-identifier">max</span>
|
346
368
|
}
|
347
369
|
|
348
370
|
<span class="ruby-identifier">counts</span>
|
@@ -352,21 +374,21 @@
|
|
352
374
|
</div>
|
353
375
|
</div>
|
354
376
|
|
355
|
-
<div id="method-
|
356
|
-
<a name="
|
377
|
+
<div id="method-M000037" class="method-detail">
|
378
|
+
<a name="M000037"></a>
|
357
379
|
|
358
380
|
<div class="method-heading">
|
359
|
-
<a href="#
|
381
|
+
<a href="#M000037" class="method-signature">
|
360
382
|
<span class="method-name">incr_doc_count</span><span class="method-args">(klass, count)</span>
|
361
383
|
</a>
|
362
384
|
</div>
|
363
385
|
|
364
386
|
<div class="method-description">
|
365
387
|
<p><a class="source-toggle" href="#"
|
366
|
-
onclick="toggleCode('
|
367
|
-
<div class="method-source-code" id="
|
388
|
+
onclick="toggleCode('M000037-source');return false;">[Source]</a></p>
|
389
|
+
<div class="method-source-code" id="M000037-source">
|
368
390
|
<pre>
|
369
|
-
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line
|
391
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 95</span>
|
370
392
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">incr_doc_count</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">count</span>)
|
371
393
|
<span class="ruby-ivar">@klass_doc_counts</span>[<span class="ruby-identifier">klass</span>] = <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">"totals:doccount"</span>, <span class="ruby-identifier">count</span>
|
372
394
|
<span class="ruby-keyword kw">end</span>
|
@@ -375,21 +397,21 @@
|
|
375
397
|
</div>
|
376
398
|
</div>
|
377
399
|
|
378
|
-
<div id="method-
|
379
|
-
<a name="
|
400
|
+
<div id="method-M000036" class="method-detail">
|
401
|
+
<a name="M000036"></a>
|
380
402
|
|
381
403
|
<div class="method-heading">
|
382
|
-
<a href="#
|
404
|
+
<a href="#M000036" class="method-signature">
|
383
405
|
<span class="method-name">incr_total_word_count</span><span class="method-args">(klass, count)</span>
|
384
406
|
</a>
|
385
407
|
</div>
|
386
408
|
|
387
409
|
<div class="method-description">
|
388
410
|
<p><a class="source-toggle" href="#"
|
389
|
-
onclick="toggleCode('
|
390
|
-
<div class="method-source-code" id="
|
411
|
+
onclick="toggleCode('M000036-source');return false;">[Source]</a></p>
|
412
|
+
<div class="method-source-code" id="M000036-source">
|
391
413
|
<pre>
|
392
|
-
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line
|
414
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 91</span>
|
393
415
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">incr_total_word_count</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">count</span>)
|
394
416
|
<span class="ruby-ivar">@klass_word_counts</span>[<span class="ruby-identifier">klass</span>] = <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">"totals:wordcount"</span>, <span class="ruby-identifier">count</span>
|
395
417
|
<span class="ruby-keyword kw">end</span>
|
@@ -398,42 +420,50 @@
|
|
398
420
|
</div>
|
399
421
|
</div>
|
400
422
|
|
401
|
-
<div id="method-
|
402
|
-
<a name="
|
423
|
+
<div id="method-M000035" class="method-detail">
|
424
|
+
<a name="M000035"></a>
|
403
425
|
|
404
426
|
<div class="method-heading">
|
405
|
-
<a href="#
|
427
|
+
<a href="#M000035" class="method-signature">
|
406
428
|
<span class="method-name">incr_word_count</span><span class="method-args">(klass, word, count)</span>
|
407
429
|
</a>
|
408
430
|
</div>
|
409
431
|
|
410
432
|
<div class="method-description">
|
411
433
|
<p><a class="source-toggle" href="#"
|
412
|
-
onclick="toggleCode('
|
413
|
-
<div class="method-source-code" id="
|
434
|
+
onclick="toggleCode('M000035-source');return false;">[Source]</a></p>
|
435
|
+
<div class="method-source-code" id="M000035-source">
|
414
436
|
<pre>
|
415
|
-
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line
|
437
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 79</span>
|
416
438
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">incr_word_count</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span>)
|
417
|
-
<span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">word</span>, <span class="ruby-node">"classes:#{klass.to_s}"</span>, <span class="ruby-identifier">count</span>
|
439
|
+
<span class="ruby-identifier">size</span> = <span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">word</span>, <span class="ruby-node">"classes:#{klass.to_s}"</span>, <span class="ruby-identifier">count</span>
|
440
|
+
<span class="ruby-comment cmt"># if this is a new word, increase the klass's vocab size. If the new word</span>
|
441
|
+
<span class="ruby-comment cmt"># count is 0, then we need to decrement our vocab size</span>
|
442
|
+
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">size</span> <span class="ruby-operator">==</span> <span class="ruby-identifier">count</span>
|
443
|
+
<span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">"totals:vocabsize"</span>
|
444
|
+
<span class="ruby-keyword kw">elsif</span> <span class="ruby-identifier">size</span> <span class="ruby-operator">==</span> <span class="ruby-value">0</span>
|
445
|
+
<span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">"totals:vocabsize"</span>, <span class="ruby-value">-1</span>
|
446
|
+
<span class="ruby-keyword kw">end</span>
|
447
|
+
<span class="ruby-identifier">size</span>
|
418
448
|
<span class="ruby-keyword kw">end</span>
|
419
449
|
</pre>
|
420
450
|
</div>
|
421
451
|
</div>
|
422
452
|
</div>
|
423
453
|
|
424
|
-
<div id="method-
|
425
|
-
<a name="
|
454
|
+
<div id="method-M000030" class="method-detail">
|
455
|
+
<a name="M000030"></a>
|
426
456
|
|
427
457
|
<div class="method-heading">
|
428
|
-
<a href="#
|
458
|
+
<a href="#M000030" class="method-signature">
|
429
459
|
<span class="method-name">init_tables</span><span class="method-args">()</span>
|
430
460
|
</a>
|
431
461
|
</div>
|
432
462
|
|
433
463
|
<div class="method-description">
|
434
464
|
<p><a class="source-toggle" href="#"
|
435
|
-
onclick="toggleCode('
|
436
|
-
<div class="method-source-code" id="
|
465
|
+
onclick="toggleCode('M000030-source');return false;">[Source]</a></p>
|
466
|
+
<div class="method-source-code" id="M000030-source">
|
437
467
|
<pre>
|
438
468
|
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 39</span>
|
439
469
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">init_tables</span>
|
@@ -450,19 +480,19 @@
|
|
450
480
|
</div>
|
451
481
|
</div>
|
452
482
|
|
453
|
-
<div id="method-
|
454
|
-
<a name="
|
483
|
+
<div id="method-M000028" class="method-detail">
|
484
|
+
<a name="M000028"></a>
|
455
485
|
|
456
486
|
<div class="method-heading">
|
457
|
-
<a href="#
|
487
|
+
<a href="#M000028" class="method-signature">
|
458
488
|
<span class="method-name">reset</span><span class="method-args">()</span>
|
459
489
|
</a>
|
460
490
|
</div>
|
461
491
|
|
462
492
|
<div class="method-description">
|
463
493
|
<p><a class="source-toggle" href="#"
|
464
|
-
onclick="toggleCode('
|
465
|
-
<div class="method-source-code" id="
|
494
|
+
onclick="toggleCode('M000028-source');return false;">[Source]</a></p>
|
495
|
+
<div class="method-source-code" id="M000028-source">
|
466
496
|
<pre>
|
467
497
|
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 25</span>
|
468
498
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">reset</span>
|
@@ -476,21 +506,21 @@
|
|
476
506
|
|
477
507
|
<h3 class="section-bar">Protected Instance methods</h3>
|
478
508
|
|
479
|
-
<div id="method-
|
480
|
-
<a name="
|
509
|
+
<div id="method-M000042" class="method-detail">
|
510
|
+
<a name="M000042"></a>
|
481
511
|
|
482
512
|
<div class="method-heading">
|
483
|
-
<a href="#
|
513
|
+
<a href="#M000042" class="method-signature">
|
484
514
|
<span class="method-name">freq_table</span><span class="method-args">()</span>
|
485
515
|
</a>
|
486
516
|
</div>
|
487
517
|
|
488
518
|
<div class="method-description">
|
489
519
|
<p><a class="source-toggle" href="#"
|
490
|
-
onclick="toggleCode('
|
491
|
-
<div class="method-source-code" id="
|
520
|
+
onclick="toggleCode('M000042-source');return false;">[Source]</a></p>
|
521
|
+
<div class="method-source-code" id="M000042-source">
|
492
522
|
<pre>
|
493
|
-
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line
|
523
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 120</span>
|
494
524
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">freq_table</span>
|
495
525
|
<span class="ruby-ivar">@ftable</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">get_table</span> <span class="ruby-ivar">@ftablename</span>
|
496
526
|
<span class="ruby-keyword kw">end</span>
|
@@ -499,21 +529,48 @@
|
|
499
529
|
</div>
|
500
530
|
</div>
|
501
531
|
|
502
|
-
<div id="method-
|
503
|
-
<a name="
|
532
|
+
<div id="method-M000040" class="method-detail">
|
533
|
+
<a name="M000040"></a>
|
504
534
|
|
505
535
|
<div class="method-heading">
|
506
|
-
<a href="#
|
536
|
+
<a href="#M000040" class="method-signature">
|
537
|
+
<span class="method-name">get_summary</span><span class="method-args">(name)</span>
|
538
|
+
</a>
|
539
|
+
</div>
|
540
|
+
|
541
|
+
<div class="method-description">
|
542
|
+
<p><a class="source-toggle" href="#"
|
543
|
+
onclick="toggleCode('M000040-source');return false;">[Source]</a></p>
|
544
|
+
<div class="method-source-code" id="M000040-source">
|
545
|
+
<pre>
|
546
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 108</span>
|
547
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_summary</span>(<span class="ruby-identifier">name</span>)
|
548
|
+
<span class="ruby-identifier">counts</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
|
549
|
+
<span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">create_scanner</span>(<span class="ruby-value str">""</span>, <span class="ruby-identifier">name</span>) { <span class="ruby-operator">|</span><span class="ruby-identifier">row</span><span class="ruby-operator">|</span>
|
550
|
+
<span class="ruby-identifier">counts</span>[<span class="ruby-identifier">row</span>.<span class="ruby-identifier">row</span>.<span class="ruby-identifier">intern</span>] = <span class="ruby-identifier">row</span>.<span class="ruby-identifier">columns</span>[<span class="ruby-identifier">name</span>].<span class="ruby-identifier">to_i64</span>
|
551
|
+
}
|
552
|
+
<span class="ruby-identifier">counts</span>
|
553
|
+
<span class="ruby-keyword kw">end</span>
|
554
|
+
</pre>
|
555
|
+
</div>
|
556
|
+
</div>
|
557
|
+
</div>
|
558
|
+
|
559
|
+
<div id="method-M000041" class="method-detail">
|
560
|
+
<a name="M000041"></a>
|
561
|
+
|
562
|
+
<div class="method-heading">
|
563
|
+
<a href="#M000041" class="method-signature">
|
507
564
|
<span class="method-name">summary_table</span><span class="method-args">()</span>
|
508
565
|
</a>
|
509
566
|
</div>
|
510
567
|
|
511
568
|
<div class="method-description">
|
512
569
|
<p><a class="source-toggle" href="#"
|
513
|
-
onclick="toggleCode('
|
514
|
-
<div class="method-source-code" id="
|
570
|
+
onclick="toggleCode('M000041-source');return false;">[Source]</a></p>
|
571
|
+
<div class="method-source-code" id="M000041-source">
|
515
572
|
<pre>
|
516
|
-
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line
|
573
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 116</span>
|
517
574
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">summary_table</span>
|
518
575
|
<span class="ruby-ivar">@stable</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">get_table</span> <span class="ruby-ivar">@stablename</span>
|
519
576
|
<span class="ruby-keyword kw">end</span>
|