ankusa 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +10 -1
- data/Rakefile +4 -4
- data/docs/classes/Ankusa/Classifier.html +125 -32
- data/docs/classes/Ankusa/HBaseStorage.html +165 -108
- data/docs/classes/Ankusa/MemoryStorage.html +117 -89
- data/docs/classes/Ankusa/TextHash.html +30 -30
- data/docs/created.rid +1 -1
- data/docs/files/README_rdoc.html +16 -3
- data/docs/files/lib/ankusa/classifier_rb.html +1 -1
- data/docs/files/lib/ankusa/hbase_storage_rb.html +1 -1
- data/docs/files/lib/ankusa/memory_storage_rb.html +1 -1
- data/docs/fr_method_index.html +40 -34
- data/lib/ankusa/classifier.rb +44 -15
- data/lib/ankusa/hbase_storage.rb +25 -8
- data/lib/ankusa/memory_storage.rb +10 -2
- metadata +6 -6
data/README.rdoc
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
= ankusa
|
2
2
|
|
3
|
-
Ankusa is a
|
3
|
+
Ankusa is a text classifier in Ruby that uses Hadoop's HBase for storage. Because it uses HBase as a backend, the training corpus can be many terabytes in size.
|
4
|
+
|
5
|
+
Ankusa currently uses a Naive Bayes classifier. It ignores common words (a.k.a. stop words) and stems all others. Additionally, it uses Laplacian smoothing in the classification method.
|
4
6
|
|
5
7
|
== Installation
|
6
8
|
First, install HBase / Hadoop. Make sure the HBase Thrift interface has been started as well. Then:
|
@@ -15,6 +17,8 @@ First, install HBase / Hadoop. Make sure the HBase Thrift interface has been st
|
|
15
17
|
storage = Ankusa::HBaseStorage.new 'localhost'
|
16
18
|
c = Ankusa::Classifier.new storage
|
17
19
|
|
20
|
+
# Each of these calls will return a bag-of-words
|
21
|
+
# hash with stemmed words as keys and counts as values
|
18
22
|
c.train :spam, "This is some spammy text"
|
19
23
|
c.train :good, "This is not the bad stuff"
|
20
24
|
|
@@ -25,6 +29,11 @@ First, install HBase / Hadoop. Make sure the HBase Thrift interface has been st
|
|
25
29
|
# membership probability as values
|
26
30
|
puts c.classifications "This is some spammy text"
|
27
31
|
|
32
|
+
# If you have a large corpus, the probabilities will
|
33
|
+
# likely all be 0. In that case, you must use log
|
34
|
+
# likelihood values
|
35
|
+
puts c.log_likelihoods "This is some spammy text"
|
36
|
+
|
28
37
|
# get a list of all classes
|
29
38
|
puts c.classes
|
30
39
|
|
data/Rakefile
CHANGED
@@ -22,11 +22,11 @@ Rake::TestTask.new("test") { |t|
|
|
22
22
|
|
23
23
|
spec = Gem::Specification.new do |s|
|
24
24
|
s.name = "ankusa"
|
25
|
-
s.version = "0.0.
|
25
|
+
s.version = "0.0.6"
|
26
26
|
s.authors = ["Brian Muller"]
|
27
|
-
s.date = %q{2010-12-
|
28
|
-
s.description = "
|
29
|
-
s.summary = "
|
27
|
+
s.date = %q{2010-12-06}
|
28
|
+
s.description = "Text classifier with HBase storage"
|
29
|
+
s.summary = "Text classifier in Ruby that uses Hadoop's HBase for storage"
|
30
30
|
s.email = "brian.muller@livingsocial.com"
|
31
31
|
s.files = FileList["lib/**/*", "[A-Z]*", "Rakefile", "docs/**/*"]
|
32
32
|
s.homepage = "https://github.com/livingsocial/ankusa"
|
@@ -88,10 +88,13 @@
|
|
88
88
|
<div class="name-list">
|
89
89
|
<a href="#M000007">classifications</a>
|
90
90
|
<a href="#M000006">classify</a>
|
91
|
-
<a href="#
|
91
|
+
<a href="#M000010">doc_count_totals</a>
|
92
|
+
<a href="#M000009">get_word_probs</a>
|
93
|
+
<a href="#M000008">log_likelihoods</a>
|
92
94
|
<a href="#M000003">new</a>
|
93
95
|
<a href="#M000004">train</a>
|
94
96
|
<a href="#M000005">untrain</a>
|
97
|
+
<a href="#M000011">vocab_sizes</a>
|
95
98
|
</div>
|
96
99
|
</div>
|
97
100
|
|
@@ -158,33 +161,28 @@
|
|
158
161
|
|
159
162
|
<div class="method-heading">
|
160
163
|
<a href="#M000007" class="method-signature">
|
161
|
-
<span class="method-name">classifications</span><span class="method-args">(text)</span>
|
164
|
+
<span class="method-name">classifications</span><span class="method-args">(text, classnames=nil)</span>
|
162
165
|
</a>
|
163
166
|
</div>
|
164
167
|
|
165
168
|
<div class="method-description">
|
169
|
+
<p>
|
170
|
+
Classes is an array of classes to look at
|
171
|
+
</p>
|
166
172
|
<p><a class="source-toggle" href="#"
|
167
173
|
onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
|
168
174
|
<div class="method-source-code" id="M000007-source">
|
169
175
|
<pre>
|
170
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
171
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>)
|
172
|
-
<span class="ruby-identifier">result</span> = <span class="ruby-
|
173
|
-
|
174
|
-
|
175
|
-
<span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>)
|
176
|
-
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) <span class="ruby-operator">*</span> <span class="ruby-identifier">count</span>) }
|
176
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 53</span>
|
177
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword kw">nil</span>)
|
178
|
+
<span class="ruby-identifier">result</span> = <span class="ruby-identifier">log_likelihoods</span> <span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>
|
179
|
+
<span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
|
180
|
+
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>]
|
177
181
|
}
|
178
182
|
|
179
|
-
<span class="ruby-comment cmt"># add the prior and exponentiate</span>
|
180
|
-
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
|
181
|
-
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">doc_count_total</span>.<span class="ruby-identifier">to_f</span>)
|
182
|
-
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span>(<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>])
|
183
|
-
}
|
184
|
-
|
185
183
|
<span class="ruby-comment cmt"># normalize to get probs</span>
|
186
184
|
<span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
|
187
|
-
<span class="ruby-
|
185
|
+
<span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span> }
|
188
186
|
<span class="ruby-identifier">result</span>
|
189
187
|
<span class="ruby-keyword kw">end</span>
|
190
188
|
</pre>
|
@@ -197,7 +195,7 @@
|
|
197
195
|
|
198
196
|
<div class="method-heading">
|
199
197
|
<a href="#M000006" class="method-signature">
|
200
|
-
<span class="method-name">classify</span><span class="method-args">(text)</span>
|
198
|
+
<span class="method-name">classify</span><span class="method-args">(text, classes=nil)</span>
|
201
199
|
</a>
|
202
200
|
</div>
|
203
201
|
|
@@ -206,10 +204,51 @@
|
|
206
204
|
onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
|
207
205
|
<div class="method-source-code" id="M000006-source">
|
208
206
|
<pre>
|
209
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
210
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>)
|
207
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 47</span>
|
208
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>=<span class="ruby-keyword kw">nil</span>)
|
211
209
|
<span class="ruby-comment cmt"># return the most probable class</span>
|
212
|
-
<span class="ruby-identifier">
|
210
|
+
<span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>).<span class="ruby-identifier">sort_by</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">c</span><span class="ruby-operator">|</span> <span class="ruby-operator">-</span><span class="ruby-identifier">c</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
|
211
|
+
<span class="ruby-keyword kw">end</span>
|
212
|
+
</pre>
|
213
|
+
</div>
|
214
|
+
</div>
|
215
|
+
</div>
|
216
|
+
|
217
|
+
<div id="method-M000008" class="method-detail">
|
218
|
+
<a name="M000008"></a>
|
219
|
+
|
220
|
+
<div class="method-heading">
|
221
|
+
<a href="#M000008" class="method-signature">
|
222
|
+
<span class="method-name">log_likelihoods</span><span class="method-args">(text, classnames=nil)</span>
|
223
|
+
</a>
|
224
|
+
</div>
|
225
|
+
|
226
|
+
<div class="method-description">
|
227
|
+
<p>
|
228
|
+
Classes is an array of classes to look at
|
229
|
+
</p>
|
230
|
+
<p><a class="source-toggle" href="#"
|
231
|
+
onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
|
232
|
+
<div class="method-source-code" id="M000008-source">
|
233
|
+
<pre>
|
234
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 66</span>
|
235
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword kw">nil</span>)
|
236
|
+
<span class="ruby-identifier">classnames</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@classnames</span>
|
237
|
+
<span class="ruby-identifier">result</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
|
238
|
+
|
239
|
+
<span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
|
240
|
+
<span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classnames</span>)
|
241
|
+
<span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) <span class="ruby-operator">*</span> <span class="ruby-identifier">count</span>) }
|
242
|
+
}
|
243
|
+
|
244
|
+
<span class="ruby-comment cmt"># add the prior and exponentiate</span>
|
245
|
+
<span class="ruby-identifier">doc_counts</span> = <span class="ruby-identifier">doc_count_totals</span>.<span class="ruby-identifier">select</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">k</span> }.<span class="ruby-identifier">map</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">v</span> }
|
246
|
+
<span class="ruby-identifier">doc_count_total</span> = (<span class="ruby-identifier">doc_counts</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> } <span class="ruby-operator">+</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">length</span>).<span class="ruby-identifier">to_f</span>
|
247
|
+
<span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
|
248
|
+
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>((<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>) <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">doc_count_total</span>)
|
249
|
+
}
|
250
|
+
|
251
|
+
<span class="ruby-identifier">result</span>
|
213
252
|
<span class="ruby-keyword kw">end</span>
|
214
253
|
</pre>
|
215
254
|
</div>
|
@@ -244,6 +283,9 @@ text can be either an array of strings or a string klass is a symbol
|
|
244
283
|
<span class="ruby-identifier">doccount</span> = (<span class="ruby-identifier">text</span>.<span class="ruby-identifier">kind_of?</span> <span class="ruby-constant">Array</span>) <span class="ruby-operator">?</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">:</span> <span class="ruby-value">1</span>
|
245
284
|
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_doc_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">doccount</span>
|
246
285
|
<span class="ruby-ivar">@classnames</span> <span class="ruby-operator"><<</span> <span class="ruby-identifier">klass</span> <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">klass</span>
|
286
|
+
<span class="ruby-comment cmt"># cache is now dirty of these vars</span>
|
287
|
+
<span class="ruby-ivar">@doc_count_totals</span> = <span class="ruby-keyword kw">nil</span>
|
288
|
+
<span class="ruby-ivar">@vocab_sizes</span> = <span class="ruby-keyword kw">nil</span>
|
247
289
|
<span class="ruby-identifier">th</span>
|
248
290
|
<span class="ruby-keyword kw">end</span>
|
249
291
|
</pre>
|
@@ -268,7 +310,7 @@ text can be either an array of strings or a string klass is a symbol
|
|
268
310
|
onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
|
269
311
|
<div class="method-source-code" id="M000005-source">
|
270
312
|
<pre>
|
271
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
313
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 32</span>
|
272
314
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">untrain</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">text</span>)
|
273
315
|
<span class="ruby-identifier">th</span> = <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>)
|
274
316
|
<span class="ruby-identifier">th</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
|
@@ -278,6 +320,9 @@ text can be either an array of strings or a string klass is a symbol
|
|
278
320
|
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_total_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
|
279
321
|
<span class="ruby-identifier">doccount</span> = (<span class="ruby-identifier">text</span>.<span class="ruby-identifier">kind_of?</span> <span class="ruby-constant">Array</span>) <span class="ruby-operator">?</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">:</span> <span class="ruby-value">1</span>
|
280
322
|
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_doc_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">doccount</span>
|
323
|
+
<span class="ruby-comment cmt"># cache is now dirty of these vars</span>
|
324
|
+
<span class="ruby-ivar">@doc_count_totals</span> = <span class="ruby-keyword kw">nil</span>
|
325
|
+
<span class="ruby-ivar">@vocab_sizes</span> = <span class="ruby-keyword kw">nil</span>
|
281
326
|
<span class="ruby-identifier">th</span>
|
282
327
|
<span class="ruby-keyword kw">end</span>
|
283
328
|
</pre>
|
@@ -287,26 +332,51 @@ text can be either an array of strings or a string klass is a symbol
|
|
287
332
|
|
288
333
|
<h3 class="section-bar">Protected Instance methods</h3>
|
289
334
|
|
290
|
-
<div id="method-
|
291
|
-
<a name="
|
335
|
+
<div id="method-M000010" class="method-detail">
|
336
|
+
<a name="M000010"></a>
|
292
337
|
|
293
338
|
<div class="method-heading">
|
294
|
-
<a href="#
|
295
|
-
<span class="method-name">
|
339
|
+
<a href="#M000010" class="method-signature">
|
340
|
+
<span class="method-name">doc_count_totals</span><span class="method-args">()</span>
|
296
341
|
</a>
|
297
342
|
</div>
|
298
343
|
|
299
344
|
<div class="method-description">
|
300
345
|
<p><a class="source-toggle" href="#"
|
301
|
-
onclick="toggleCode('
|
302
|
-
<div class="method-source-code" id="
|
346
|
+
onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
|
347
|
+
<div class="method-source-code" id="M000010-source">
|
303
348
|
<pre>
|
304
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
305
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">
|
306
|
-
<span class="ruby-
|
307
|
-
|
349
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 97</span>
|
350
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">doc_count_totals</span>
|
351
|
+
<span class="ruby-ivar">@doc_count_totals</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">doc_count_totals</span>
|
352
|
+
<span class="ruby-keyword kw">end</span>
|
353
|
+
</pre>
|
354
|
+
</div>
|
355
|
+
</div>
|
356
|
+
</div>
|
357
|
+
|
358
|
+
<div id="method-M000009" class="method-detail">
|
359
|
+
<a name="M000009"></a>
|
360
|
+
|
361
|
+
<div class="method-heading">
|
362
|
+
<a href="#M000009" class="method-signature">
|
363
|
+
<span class="method-name">get_word_probs</span><span class="method-args">(word, classnames)</span>
|
364
|
+
</a>
|
365
|
+
</div>
|
366
|
+
|
367
|
+
<div class="method-description">
|
368
|
+
<p><a class="source-toggle" href="#"
|
369
|
+
onclick="toggleCode('M000009-source');return false;">[Source]</a></p>
|
370
|
+
<div class="method-source-code" id="M000009-source">
|
371
|
+
<pre>
|
372
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 86</span>
|
373
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classnames</span>)
|
374
|
+
<span class="ruby-identifier">probs</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
|
375
|
+
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_word_counts</span>(<span class="ruby-identifier">word</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">v</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">k</span> }
|
376
|
+
<span class="ruby-identifier">vs</span> = <span class="ruby-identifier">vocab_sizes</span>
|
377
|
+
<span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cn</span><span class="ruby-operator">|</span>
|
308
378
|
<span class="ruby-comment cmt"># use a laplacian smoother</span>
|
309
|
-
<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] = (<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> (<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_total_word_count</span>(<span class="ruby-identifier">cn</span>) <span class="ruby-operator">+</span> <span class="ruby-
|
379
|
+
<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] = (<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> (<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_total_word_count</span>(<span class="ruby-identifier">cn</span>) <span class="ruby-operator">+</span> <span class="ruby-identifier">vs</span>[<span class="ruby-identifier">cn</span>]).<span class="ruby-identifier">to_f</span>
|
310
380
|
}
|
311
381
|
<span class="ruby-identifier">probs</span>
|
312
382
|
<span class="ruby-keyword kw">end</span>
|
@@ -315,6 +385,29 @@ text can be either an array of strings or a string klass is a symbol
|
|
315
385
|
</div>
|
316
386
|
</div>
|
317
387
|
|
388
|
+
<div id="method-M000011" class="method-detail">
|
389
|
+
<a name="M000011"></a>
|
390
|
+
|
391
|
+
<div class="method-heading">
|
392
|
+
<a href="#M000011" class="method-signature">
|
393
|
+
<span class="method-name">vocab_sizes</span><span class="method-args">()</span>
|
394
|
+
</a>
|
395
|
+
</div>
|
396
|
+
|
397
|
+
<div class="method-description">
|
398
|
+
<p><a class="source-toggle" href="#"
|
399
|
+
onclick="toggleCode('M000011-source');return false;">[Source]</a></p>
|
400
|
+
<div class="method-source-code" id="M000011-source">
|
401
|
+
<pre>
|
402
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 101</span>
|
403
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">vocab_sizes</span>
|
404
|
+
<span class="ruby-ivar">@vocab_sizes</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_vocabulary_sizes</span>
|
405
|
+
<span class="ruby-keyword kw">end</span>
|
406
|
+
</pre>
|
407
|
+
</div>
|
408
|
+
</div>
|
409
|
+
</div>
|
410
|
+
|
318
411
|
|
319
412
|
</div>
|
320
413
|
|
@@ -86,21 +86,23 @@
|
|
86
86
|
<h3 class="section-bar">Methods</h3>
|
87
87
|
|
88
88
|
<div class="name-list">
|
89
|
-
<a href="#
|
90
|
-
<a href="#
|
91
|
-
<a href="#
|
92
|
-
<a href="#
|
93
|
-
<a href="#
|
94
|
-
<a href="#
|
95
|
-
<a href="#
|
96
|
-
<a href="#
|
97
|
-
<a href="#M000032">
|
98
|
-
<a href="#M000031">
|
99
|
-
<a href="#
|
100
|
-
<a href="#
|
101
|
-
<a href="#
|
102
|
-
<a href="#
|
103
|
-
<a href="#
|
89
|
+
<a href="#M000027">classnames</a>
|
90
|
+
<a href="#M000039">close</a>
|
91
|
+
<a href="#M000038">doc_count_totals</a>
|
92
|
+
<a href="#M000029">drop_tables</a>
|
93
|
+
<a href="#M000042">freq_table</a>
|
94
|
+
<a href="#M000034">get_doc_count</a>
|
95
|
+
<a href="#M000040">get_summary</a>
|
96
|
+
<a href="#M000033">get_total_word_count</a>
|
97
|
+
<a href="#M000032">get_vocabulary_sizes</a>
|
98
|
+
<a href="#M000031">get_word_counts</a>
|
99
|
+
<a href="#M000037">incr_doc_count</a>
|
100
|
+
<a href="#M000036">incr_total_word_count</a>
|
101
|
+
<a href="#M000035">incr_word_count</a>
|
102
|
+
<a href="#M000030">init_tables</a>
|
103
|
+
<a href="#M000026">new</a>
|
104
|
+
<a href="#M000028">reset</a>
|
105
|
+
<a href="#M000041">summary_table</a>
|
104
106
|
</div>
|
105
107
|
</div>
|
106
108
|
|
@@ -135,19 +137,19 @@
|
|
135
137
|
<div id="methods">
|
136
138
|
<h3 class="section-bar">Public Class methods</h3>
|
137
139
|
|
138
|
-
<div id="method-
|
139
|
-
<a name="
|
140
|
+
<div id="method-M000026" class="method-detail">
|
141
|
+
<a name="M000026"></a>
|
140
142
|
|
141
143
|
<div class="method-heading">
|
142
|
-
<a href="#
|
144
|
+
<a href="#M000026" class="method-signature">
|
143
145
|
<span class="method-name">new</span><span class="method-args">(host='localhost', port=9090, frequency_tablename="ankusa_word_frequencies", summary_tablename="ankusa_summary")</span>
|
144
146
|
</a>
|
145
147
|
</div>
|
146
148
|
|
147
149
|
<div class="method-description">
|
148
150
|
<p><a class="source-toggle" href="#"
|
149
|
-
onclick="toggleCode('
|
150
|
-
<div class="method-source-code" id="
|
151
|
+
onclick="toggleCode('M000026-source');return false;">[Source]</a></p>
|
152
|
+
<div class="method-source-code" id="M000026-source">
|
151
153
|
<pre>
|
152
154
|
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 8</span>
|
153
155
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">host</span>=<span class="ruby-value str">'localhost'</span>, <span class="ruby-identifier">port</span>=<span class="ruby-value">9090</span>, <span class="ruby-identifier">frequency_tablename</span>=<span class="ruby-value str">"ankusa_word_frequencies"</span>, <span class="ruby-identifier">summary_tablename</span>=<span class="ruby-value str">"ankusa_summary"</span>)
|
@@ -165,19 +167,19 @@
|
|
165
167
|
|
166
168
|
<h3 class="section-bar">Public Instance methods</h3>
|
167
169
|
|
168
|
-
<div id="method-
|
169
|
-
<a name="
|
170
|
+
<div id="method-M000027" class="method-detail">
|
171
|
+
<a name="M000027"></a>
|
170
172
|
|
171
173
|
<div class="method-heading">
|
172
|
-
<a href="#
|
174
|
+
<a href="#M000027" class="method-signature">
|
173
175
|
<span class="method-name">classnames</span><span class="method-args">()</span>
|
174
176
|
</a>
|
175
177
|
</div>
|
176
178
|
|
177
179
|
<div class="method-description">
|
178
180
|
<p><a class="source-toggle" href="#"
|
179
|
-
onclick="toggleCode('
|
180
|
-
<div class="method-source-code" id="
|
181
|
+
onclick="toggleCode('M000027-source');return false;">[Source]</a></p>
|
182
|
+
<div class="method-source-code" id="M000027-source">
|
181
183
|
<pre>
|
182
184
|
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 17</span>
|
183
185
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classnames</span>
|
@@ -192,21 +194,21 @@
|
|
192
194
|
</div>
|
193
195
|
</div>
|
194
196
|
|
195
|
-
<div id="method-
|
196
|
-
<a name="
|
197
|
+
<div id="method-M000039" class="method-detail">
|
198
|
+
<a name="M000039"></a>
|
197
199
|
|
198
200
|
<div class="method-heading">
|
199
|
-
<a href="#
|
201
|
+
<a href="#M000039" class="method-signature">
|
200
202
|
<span class="method-name">close</span><span class="method-args">()</span>
|
201
203
|
</a>
|
202
204
|
</div>
|
203
205
|
|
204
206
|
<div class="method-description">
|
205
207
|
<p><a class="source-toggle" href="#"
|
206
|
-
onclick="toggleCode('
|
207
|
-
<div class="method-source-code" id="
|
208
|
+
onclick="toggleCode('M000039-source');return false;">[Source]</a></p>
|
209
|
+
<div class="method-source-code" id="M000039-source">
|
208
210
|
<pre>
|
209
|
-
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line
|
211
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 103</span>
|
210
212
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">close</span>
|
211
213
|
<span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">close</span>
|
212
214
|
<span class="ruby-keyword kw">end</span>
|
@@ -215,46 +217,42 @@
|
|
215
217
|
</div>
|
216
218
|
</div>
|
217
219
|
|
218
|
-
<div id="method-
|
219
|
-
<a name="
|
220
|
+
<div id="method-M000038" class="method-detail">
|
221
|
+
<a name="M000038"></a>
|
220
222
|
|
221
223
|
<div class="method-heading">
|
222
|
-
<a href="#
|
223
|
-
<span class="method-name">
|
224
|
+
<a href="#M000038" class="method-signature">
|
225
|
+
<span class="method-name">doc_count_totals</span><span class="method-args">()</span>
|
224
226
|
</a>
|
225
227
|
</div>
|
226
228
|
|
227
229
|
<div class="method-description">
|
228
230
|
<p><a class="source-toggle" href="#"
|
229
|
-
onclick="toggleCode('
|
230
|
-
<div class="method-source-code" id="
|
231
|
+
onclick="toggleCode('M000038-source');return false;">[Source]</a></p>
|
232
|
+
<div class="method-source-code" id="M000038-source">
|
231
233
|
<pre>
|
232
|
-
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line
|
233
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">
|
234
|
-
<span class="ruby-identifier">
|
235
|
-
<span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">create_scanner</span>(<span class="ruby-value str">""</span>, <span class="ruby-value str">"totals:doccount"</span>) { <span class="ruby-operator">|</span><span class="ruby-identifier">row</span><span class="ruby-operator">|</span>
|
236
|
-
<span class="ruby-identifier">total</span> <span class="ruby-operator">+=</span> <span class="ruby-identifier">row</span>.<span class="ruby-identifier">columns</span>[<span class="ruby-value str">"totals:doccount"</span>].<span class="ruby-identifier">to_i64</span>
|
237
|
-
}
|
238
|
-
<span class="ruby-identifier">total</span>
|
234
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 99</span>
|
235
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">doc_count_totals</span>
|
236
|
+
<span class="ruby-identifier">get_summary</span> <span class="ruby-value str">"totals:doccount"</span>
|
239
237
|
<span class="ruby-keyword kw">end</span>
|
240
238
|
</pre>
|
241
239
|
</div>
|
242
240
|
</div>
|
243
241
|
</div>
|
244
242
|
|
245
|
-
<div id="method-
|
246
|
-
<a name="
|
243
|
+
<div id="method-M000029" class="method-detail">
|
244
|
+
<a name="M000029"></a>
|
247
245
|
|
248
246
|
<div class="method-heading">
|
249
|
-
<a href="#
|
247
|
+
<a href="#M000029" class="method-signature">
|
250
248
|
<span class="method-name">drop_tables</span><span class="method-args">()</span>
|
251
249
|
</a>
|
252
250
|
</div>
|
253
251
|
|
254
252
|
<div class="method-description">
|
255
253
|
<p><a class="source-toggle" href="#"
|
256
|
-
onclick="toggleCode('
|
257
|
-
<div class="method-source-code" id="
|
254
|
+
onclick="toggleCode('M000029-source');return false;">[Source]</a></p>
|
255
|
+
<div class="method-source-code" id="M000029-source">
|
258
256
|
<pre>
|
259
257
|
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 30</span>
|
260
258
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">drop_tables</span>
|
@@ -270,21 +268,21 @@
|
|
270
268
|
</div>
|
271
269
|
</div>
|
272
270
|
|
273
|
-
<div id="method-
|
274
|
-
<a name="
|
271
|
+
<div id="method-M000034" class="method-detail">
|
272
|
+
<a name="M000034"></a>
|
275
273
|
|
276
274
|
<div class="method-heading">
|
277
|
-
<a href="#
|
275
|
+
<a href="#M000034" class="method-signature">
|
278
276
|
<span class="method-name">get_doc_count</span><span class="method-args">(klass)</span>
|
279
277
|
</a>
|
280
278
|
</div>
|
281
279
|
|
282
280
|
<div class="method-description">
|
283
281
|
<p><a class="source-toggle" href="#"
|
284
|
-
onclick="toggleCode('
|
285
|
-
<div class="method-source-code" id="
|
282
|
+
onclick="toggleCode('M000034-source');return false;">[Source]</a></p>
|
283
|
+
<div class="method-source-code" id="M000034-source">
|
286
284
|
<pre>
|
287
|
-
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line
|
285
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 73</span>
|
288
286
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">klass</span>)
|
289
287
|
<span class="ruby-ivar">@klass_doc_counts</span>.<span class="ruby-identifier">fetch</span>(<span class="ruby-identifier">klass</span>) {
|
290
288
|
<span class="ruby-ivar">@klass_doc_counts</span>[<span class="ruby-identifier">klass</span>] = <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">get</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-value str">"totals:doccount"</span>).<span class="ruby-identifier">first</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span>
|
@@ -295,21 +293,21 @@
|
|
295
293
|
</div>
|
296
294
|
</div>
|
297
295
|
|
298
|
-
<div id="method-
|
299
|
-
<a name="
|
296
|
+
<div id="method-M000033" class="method-detail">
|
297
|
+
<a name="M000033"></a>
|
300
298
|
|
301
299
|
<div class="method-heading">
|
302
|
-
<a href="#
|
300
|
+
<a href="#M000033" class="method-signature">
|
303
301
|
<span class="method-name">get_total_word_count</span><span class="method-args">(klass)</span>
|
304
302
|
</a>
|
305
303
|
</div>
|
306
304
|
|
307
305
|
<div class="method-description">
|
308
306
|
<p><a class="source-toggle" href="#"
|
309
|
-
onclick="toggleCode('
|
310
|
-
<div class="method-source-code" id="
|
307
|
+
onclick="toggleCode('M000033-source');return false;">[Source]</a></p>
|
308
|
+
<div class="method-source-code" id="M000033-source">
|
311
309
|
<pre>
|
312
|
-
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line
|
310
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 67</span>
|
313
311
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_total_word_count</span>(<span class="ruby-identifier">klass</span>)
|
314
312
|
<span class="ruby-ivar">@klass_word_counts</span>.<span class="ruby-identifier">fetch</span>(<span class="ruby-identifier">klass</span>) {
|
315
313
|
<span class="ruby-ivar">@klass_word_counts</span>[<span class="ruby-identifier">klass</span>] = <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">get</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-value str">"totals:wordcount"</span>).<span class="ruby-identifier">first</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span>
|
@@ -320,19 +318,42 @@
|
|
320
318
|
</div>
|
321
319
|
</div>
|
322
320
|
|
323
|
-
<div id="method-
|
324
|
-
<a name="
|
321
|
+
<div id="method-M000032" class="method-detail">
|
322
|
+
<a name="M000032"></a>
|
325
323
|
|
326
324
|
<div class="method-heading">
|
327
|
-
<a href="#
|
325
|
+
<a href="#M000032" class="method-signature">
|
326
|
+
<span class="method-name">get_vocabulary_sizes</span><span class="method-args">()</span>
|
327
|
+
</a>
|
328
|
+
</div>
|
329
|
+
|
330
|
+
<div class="method-description">
|
331
|
+
<p><a class="source-toggle" href="#"
|
332
|
+
onclick="toggleCode('M000032-source');return false;">[Source]</a></p>
|
333
|
+
<div class="method-source-code" id="M000032-source">
|
334
|
+
<pre>
|
335
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 63</span>
|
336
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_vocabulary_sizes</span>
|
337
|
+
<span class="ruby-identifier">get_summary</span> <span class="ruby-value str">"totals:vocabsize"</span>
|
338
|
+
<span class="ruby-keyword kw">end</span>
|
339
|
+
</pre>
|
340
|
+
</div>
|
341
|
+
</div>
|
342
|
+
</div>
|
343
|
+
|
344
|
+
<div id="method-M000031" class="method-detail">
|
345
|
+
<a name="M000031"></a>
|
346
|
+
|
347
|
+
<div class="method-heading">
|
348
|
+
<a href="#M000031" class="method-signature">
|
328
349
|
<span class="method-name">get_word_counts</span><span class="method-args">(word)</span>
|
329
350
|
</a>
|
330
351
|
</div>
|
331
352
|
|
332
353
|
<div class="method-description">
|
333
354
|
<p><a class="source-toggle" href="#"
|
334
|
-
onclick="toggleCode('
|
335
|
-
<div class="method-source-code" id="
|
355
|
+
onclick="toggleCode('M000031-source');return false;">[Source]</a></p>
|
356
|
+
<div class="method-source-code" id="M000031-source">
|
336
357
|
<pre>
|
337
358
|
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 49</span>
|
338
359
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_counts</span>(<span class="ruby-identifier">word</span>)
|
@@ -342,7 +363,8 @@
|
|
342
363
|
|
343
364
|
<span class="ruby-identifier">row</span>.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">columns</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">colname</span>, <span class="ruby-identifier">cell</span><span class="ruby-operator">|</span>
|
344
365
|
<span class="ruby-identifier">classname</span> = <span class="ruby-identifier">colname</span>.<span class="ruby-identifier">split</span>(<span class="ruby-value str">':'</span>)[<span class="ruby-value">1</span>].<span class="ruby-identifier">intern</span>
|
345
|
-
<span class="ruby-
|
366
|
+
<span class="ruby-comment cmt"># in case untrain has been called too many times</span>
|
367
|
+
<span class="ruby-identifier">counts</span>[<span class="ruby-identifier">classname</span>] = [<span class="ruby-identifier">cell</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span>, <span class="ruby-value">0</span>].<span class="ruby-identifier">max</span>
|
346
368
|
}
|
347
369
|
|
348
370
|
<span class="ruby-identifier">counts</span>
|
@@ -352,21 +374,21 @@
|
|
352
374
|
</div>
|
353
375
|
</div>
|
354
376
|
|
355
|
-
<div id="method-
|
356
|
-
<a name="
|
377
|
+
<div id="method-M000037" class="method-detail">
|
378
|
+
<a name="M000037"></a>
|
357
379
|
|
358
380
|
<div class="method-heading">
|
359
|
-
<a href="#
|
381
|
+
<a href="#M000037" class="method-signature">
|
360
382
|
<span class="method-name">incr_doc_count</span><span class="method-args">(klass, count)</span>
|
361
383
|
</a>
|
362
384
|
</div>
|
363
385
|
|
364
386
|
<div class="method-description">
|
365
387
|
<p><a class="source-toggle" href="#"
|
366
|
-
onclick="toggleCode('
|
367
|
-
<div class="method-source-code" id="
|
388
|
+
onclick="toggleCode('M000037-source');return false;">[Source]</a></p>
|
389
|
+
<div class="method-source-code" id="M000037-source">
|
368
390
|
<pre>
|
369
|
-
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line
|
391
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 95</span>
|
370
392
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">incr_doc_count</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">count</span>)
|
371
393
|
<span class="ruby-ivar">@klass_doc_counts</span>[<span class="ruby-identifier">klass</span>] = <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">"totals:doccount"</span>, <span class="ruby-identifier">count</span>
|
372
394
|
<span class="ruby-keyword kw">end</span>
|
@@ -375,21 +397,21 @@
|
|
375
397
|
</div>
|
376
398
|
</div>
|
377
399
|
|
378
|
-
<div id="method-
|
379
|
-
<a name="
|
400
|
+
<div id="method-M000036" class="method-detail">
|
401
|
+
<a name="M000036"></a>
|
380
402
|
|
381
403
|
<div class="method-heading">
|
382
|
-
<a href="#
|
404
|
+
<a href="#M000036" class="method-signature">
|
383
405
|
<span class="method-name">incr_total_word_count</span><span class="method-args">(klass, count)</span>
|
384
406
|
</a>
|
385
407
|
</div>
|
386
408
|
|
387
409
|
<div class="method-description">
|
388
410
|
<p><a class="source-toggle" href="#"
|
389
|
-
onclick="toggleCode('
|
390
|
-
<div class="method-source-code" id="
|
411
|
+
onclick="toggleCode('M000036-source');return false;">[Source]</a></p>
|
412
|
+
<div class="method-source-code" id="M000036-source">
|
391
413
|
<pre>
|
392
|
-
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line
|
414
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 91</span>
|
393
415
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">incr_total_word_count</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">count</span>)
|
394
416
|
<span class="ruby-ivar">@klass_word_counts</span>[<span class="ruby-identifier">klass</span>] = <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">"totals:wordcount"</span>, <span class="ruby-identifier">count</span>
|
395
417
|
<span class="ruby-keyword kw">end</span>
|
@@ -398,42 +420,50 @@
|
|
398
420
|
</div>
|
399
421
|
</div>
|
400
422
|
|
401
|
-
<div id="method-
|
402
|
-
<a name="
|
423
|
+
<div id="method-M000035" class="method-detail">
|
424
|
+
<a name="M000035"></a>
|
403
425
|
|
404
426
|
<div class="method-heading">
|
405
|
-
<a href="#
|
427
|
+
<a href="#M000035" class="method-signature">
|
406
428
|
<span class="method-name">incr_word_count</span><span class="method-args">(klass, word, count)</span>
|
407
429
|
</a>
|
408
430
|
</div>
|
409
431
|
|
410
432
|
<div class="method-description">
|
411
433
|
<p><a class="source-toggle" href="#"
|
412
|
-
onclick="toggleCode('
|
413
|
-
<div class="method-source-code" id="
|
434
|
+
onclick="toggleCode('M000035-source');return false;">[Source]</a></p>
|
435
|
+
<div class="method-source-code" id="M000035-source">
|
414
436
|
<pre>
|
415
|
-
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line
|
437
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 79</span>
|
416
438
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">incr_word_count</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span>)
|
417
|
-
<span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">word</span>, <span class="ruby-node">"classes:#{klass.to_s}"</span>, <span class="ruby-identifier">count</span>
|
439
|
+
<span class="ruby-identifier">size</span> = <span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">word</span>, <span class="ruby-node">"classes:#{klass.to_s}"</span>, <span class="ruby-identifier">count</span>
|
440
|
+
<span class="ruby-comment cmt"># if this is a new word, increase the klass's vocab size. If the new word</span>
|
441
|
+
<span class="ruby-comment cmt"># count is 0, then we need to decrement our vocab size</span>
|
442
|
+
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">size</span> <span class="ruby-operator">==</span> <span class="ruby-identifier">count</span>
|
443
|
+
<span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">"totals:vocabsize"</span>
|
444
|
+
<span class="ruby-keyword kw">elsif</span> <span class="ruby-identifier">size</span> <span class="ruby-operator">==</span> <span class="ruby-value">0</span>
|
445
|
+
<span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">"totals:vocabsize"</span>, <span class="ruby-value">-1</span>
|
446
|
+
<span class="ruby-keyword kw">end</span>
|
447
|
+
<span class="ruby-identifier">size</span>
|
418
448
|
<span class="ruby-keyword kw">end</span>
|
419
449
|
</pre>
|
420
450
|
</div>
|
421
451
|
</div>
|
422
452
|
</div>
|
423
453
|
|
424
|
-
<div id="method-
|
425
|
-
<a name="
|
454
|
+
<div id="method-M000030" class="method-detail">
|
455
|
+
<a name="M000030"></a>
|
426
456
|
|
427
457
|
<div class="method-heading">
|
428
|
-
<a href="#
|
458
|
+
<a href="#M000030" class="method-signature">
|
429
459
|
<span class="method-name">init_tables</span><span class="method-args">()</span>
|
430
460
|
</a>
|
431
461
|
</div>
|
432
462
|
|
433
463
|
<div class="method-description">
|
434
464
|
<p><a class="source-toggle" href="#"
|
435
|
-
onclick="toggleCode('
|
436
|
-
<div class="method-source-code" id="
|
465
|
+
onclick="toggleCode('M000030-source');return false;">[Source]</a></p>
|
466
|
+
<div class="method-source-code" id="M000030-source">
|
437
467
|
<pre>
|
438
468
|
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 39</span>
|
439
469
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">init_tables</span>
|
@@ -450,19 +480,19 @@
|
|
450
480
|
</div>
|
451
481
|
</div>
|
452
482
|
|
453
|
-
<div id="method-
|
454
|
-
<a name="
|
483
|
+
<div id="method-M000028" class="method-detail">
|
484
|
+
<a name="M000028"></a>
|
455
485
|
|
456
486
|
<div class="method-heading">
|
457
|
-
<a href="#
|
487
|
+
<a href="#M000028" class="method-signature">
|
458
488
|
<span class="method-name">reset</span><span class="method-args">()</span>
|
459
489
|
</a>
|
460
490
|
</div>
|
461
491
|
|
462
492
|
<div class="method-description">
|
463
493
|
<p><a class="source-toggle" href="#"
|
464
|
-
onclick="toggleCode('
|
465
|
-
<div class="method-source-code" id="
|
494
|
+
onclick="toggleCode('M000028-source');return false;">[Source]</a></p>
|
495
|
+
<div class="method-source-code" id="M000028-source">
|
466
496
|
<pre>
|
467
497
|
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 25</span>
|
468
498
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">reset</span>
|
@@ -476,21 +506,21 @@
|
|
476
506
|
|
477
507
|
<h3 class="section-bar">Protected Instance methods</h3>
|
478
508
|
|
479
|
-
<div id="method-
|
480
|
-
<a name="
|
509
|
+
<div id="method-M000042" class="method-detail">
|
510
|
+
<a name="M000042"></a>
|
481
511
|
|
482
512
|
<div class="method-heading">
|
483
|
-
<a href="#
|
513
|
+
<a href="#M000042" class="method-signature">
|
484
514
|
<span class="method-name">freq_table</span><span class="method-args">()</span>
|
485
515
|
</a>
|
486
516
|
</div>
|
487
517
|
|
488
518
|
<div class="method-description">
|
489
519
|
<p><a class="source-toggle" href="#"
|
490
|
-
onclick="toggleCode('
|
491
|
-
<div class="method-source-code" id="
|
520
|
+
onclick="toggleCode('M000042-source');return false;">[Source]</a></p>
|
521
|
+
<div class="method-source-code" id="M000042-source">
|
492
522
|
<pre>
|
493
|
-
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line
|
523
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 120</span>
|
494
524
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">freq_table</span>
|
495
525
|
<span class="ruby-ivar">@ftable</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">get_table</span> <span class="ruby-ivar">@ftablename</span>
|
496
526
|
<span class="ruby-keyword kw">end</span>
|
@@ -499,21 +529,48 @@
|
|
499
529
|
</div>
|
500
530
|
</div>
|
501
531
|
|
502
|
-
<div id="method-
|
503
|
-
<a name="
|
532
|
+
<div id="method-M000040" class="method-detail">
|
533
|
+
<a name="M000040"></a>
|
504
534
|
|
505
535
|
<div class="method-heading">
|
506
|
-
<a href="#
|
536
|
+
<a href="#M000040" class="method-signature">
|
537
|
+
<span class="method-name">get_summary</span><span class="method-args">(name)</span>
|
538
|
+
</a>
|
539
|
+
</div>
|
540
|
+
|
541
|
+
<div class="method-description">
|
542
|
+
<p><a class="source-toggle" href="#"
|
543
|
+
onclick="toggleCode('M000040-source');return false;">[Source]</a></p>
|
544
|
+
<div class="method-source-code" id="M000040-source">
|
545
|
+
<pre>
|
546
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 108</span>
|
547
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_summary</span>(<span class="ruby-identifier">name</span>)
|
548
|
+
<span class="ruby-identifier">counts</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
|
549
|
+
<span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">create_scanner</span>(<span class="ruby-value str">""</span>, <span class="ruby-identifier">name</span>) { <span class="ruby-operator">|</span><span class="ruby-identifier">row</span><span class="ruby-operator">|</span>
|
550
|
+
<span class="ruby-identifier">counts</span>[<span class="ruby-identifier">row</span>.<span class="ruby-identifier">row</span>.<span class="ruby-identifier">intern</span>] = <span class="ruby-identifier">row</span>.<span class="ruby-identifier">columns</span>[<span class="ruby-identifier">name</span>].<span class="ruby-identifier">to_i64</span>
|
551
|
+
}
|
552
|
+
<span class="ruby-identifier">counts</span>
|
553
|
+
<span class="ruby-keyword kw">end</span>
|
554
|
+
</pre>
|
555
|
+
</div>
|
556
|
+
</div>
|
557
|
+
</div>
|
558
|
+
|
559
|
+
<div id="method-M000041" class="method-detail">
|
560
|
+
<a name="M000041"></a>
|
561
|
+
|
562
|
+
<div class="method-heading">
|
563
|
+
<a href="#M000041" class="method-signature">
|
507
564
|
<span class="method-name">summary_table</span><span class="method-args">()</span>
|
508
565
|
</a>
|
509
566
|
</div>
|
510
567
|
|
511
568
|
<div class="method-description">
|
512
569
|
<p><a class="source-toggle" href="#"
|
513
|
-
onclick="toggleCode('
|
514
|
-
<div class="method-source-code" id="
|
570
|
+
onclick="toggleCode('M000041-source');return false;">[Source]</a></p>
|
571
|
+
<div class="method-source-code" id="M000041-source">
|
515
572
|
<pre>
|
516
|
-
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line
|
573
|
+
<span class="ruby-comment cmt"># File lib/ankusa/hbase_storage.rb, line 116</span>
|
517
574
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">summary_table</span>
|
518
575
|
<span class="ruby-ivar">@stable</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">get_table</span> <span class="ruby-ivar">@stablename</span>
|
519
576
|
<span class="ruby-keyword kw">end</span>
|