ankusa 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +6 -4
- data/Rakefile +2 -2
- data/docs/classes/Ankusa.html +8 -8
- data/docs/classes/Ankusa/Classifier.html +82 -271
- data/docs/classes/Ankusa/HBaseStorage.html +537 -0
- data/docs/classes/Ankusa/MemoryStorage.html +439 -0
- data/docs/classes/Ankusa/TextHash.html +84 -29
- data/docs/classes/String.html +172 -0
- data/docs/created.rid +1 -1
- data/docs/files/README_rdoc.html +6 -4
- data/docs/files/lib/ankusa/classifier_rb.html +1 -1
- data/docs/files/lib/ankusa/extensions_rb.html +108 -0
- data/docs/files/lib/ankusa/hasher_rb.html +1 -1
- data/docs/files/lib/ankusa/hbase_storage_rb.html +108 -0
- data/docs/files/lib/ankusa/{nbclass_rb.html → memory_storage_rb.html} +4 -4
- data/docs/files/lib/ankusa_rb.html +4 -2
- data/docs/fr_class_index.html +3 -1
- data/docs/fr_file_index.html +3 -1
- data/docs/fr_method_index.html +41 -17
- data/lib/ankusa.rb +3 -1
- data/lib/ankusa/classifier.rb +37 -86
- data/lib/ankusa/extensions.rb +13 -0
- data/lib/ankusa/hasher.rb +24 -10
- data/lib/ankusa/hbase_storage.rb +109 -0
- data/lib/ankusa/memory_storage.rb +61 -0
- metadata +13 -7
- data/docs/classes/Ankusa/NBClass.html +0 -168
- data/lib/ankusa/nbclass.rb +0 -15
data/README.rdoc
CHANGED
@@ -10,12 +10,11 @@ First, install HBase / Hadoop. Make sure the HBase Thrift interface has been st
|
|
10
10
|
== Basic Usage
|
11
11
|
require 'rubygems'
|
12
12
|
require 'ankusa'
|
13
|
-
require 'hbaserb'
|
14
13
|
|
15
14
|
# connect to HBase
|
16
|
-
|
15
|
+
storage = Ankusa::HBaseStorage.new 'localhost'
|
16
|
+
c = Ankusa::Classifier.new storage
|
17
17
|
|
18
|
-
c = Classifier.new client
|
19
18
|
c.train :spam, "This is some spammy text"
|
20
19
|
c.train :good, "This is not the bad stuff"
|
21
20
|
|
@@ -27,4 +26,7 @@ First, install HBase / Hadoop. Make sure the HBase Thrift interface has been st
|
|
27
26
|
puts c.classifications "This is some spammy text"
|
28
27
|
|
29
28
|
# get a list of all classes
|
30
|
-
puts c.classes
|
29
|
+
puts c.classes
|
30
|
+
|
31
|
+
# close connection
|
32
|
+
storage.close
|
data/Rakefile
CHANGED
@@ -22,9 +22,9 @@ Rake::TestTask.new("test") { |t|
|
|
22
22
|
|
23
23
|
spec = Gem::Specification.new do |s|
|
24
24
|
s.name = "ankusa"
|
25
|
-
s.version = "0.0.2"
|
25
|
+
s.version = "0.0.3"
|
26
26
|
s.authors = ["Brian Muller"]
|
27
|
-
s.date = %q{2010-
|
27
|
+
s.date = %q{2010-12-02}
|
28
28
|
s.description = "Naive Bayes classifier with HBase storage"
|
29
29
|
s.summary = "Naive Bayes classifier in Ruby that uses Hadoop's HBase for storage"
|
30
30
|
s.email = "brian.muller@livingsocial.com"
|
data/docs/classes/Ankusa.html
CHANGED
@@ -63,8 +63,12 @@
|
|
63
63
|
lib/ankusa/hasher.rb
|
64
64
|
</a>
|
65
65
|
<br />
|
66
|
-
<a href="../files/lib/ankusa/
|
67
|
-
lib/ankusa/
|
66
|
+
<a href="../files/lib/ankusa/hbase_storage_rb.html">
|
67
|
+
lib/ankusa/hbase_storage.rb
|
68
|
+
</a>
|
69
|
+
<br />
|
70
|
+
<a href="../files/lib/ankusa/memory_storage_rb.html">
|
71
|
+
lib/ankusa/memory_storage.rb
|
68
72
|
</a>
|
69
73
|
<br />
|
70
74
|
<a href="../files/lib/ankusa/stopwords_rb.html">
|
@@ -100,7 +104,8 @@
|
|
100
104
|
<h3 class="section-bar">Classes and Modules</h3>
|
101
105
|
|
102
106
|
Class <a href="Ankusa/Classifier.html" class="link">Ankusa::Classifier</a><br />
|
103
|
-
Class <a href="Ankusa/
|
107
|
+
Class <a href="Ankusa/HBaseStorage.html" class="link">Ankusa::HBaseStorage</a><br />
|
108
|
+
Class <a href="Ankusa/MemoryStorage.html" class="link">Ankusa::MemoryStorage</a><br />
|
104
109
|
Class <a href="Ankusa/TextHash.html" class="link">Ankusa::TextHash</a><br />
|
105
110
|
|
106
111
|
</div>
|
@@ -110,11 +115,6 @@ Class <a href="Ankusa/TextHash.html" class="link">Ankusa::TextHash</a><br />
|
|
110
115
|
|
111
116
|
<div class="name-list">
|
112
117
|
<table summary="Constants">
|
113
|
-
<tr class="top-aligned-row context-row">
|
114
|
-
<td class="context-item-name">SMALL_PROB</td>
|
115
|
-
<td>=</td>
|
116
|
-
<td class="context-item-value">0.0001</td>
|
117
|
-
</tr>
|
118
118
|
<tr class="top-aligned-row context-row">
|
119
119
|
<td class="context-item-name">STOPWORDS</td>
|
120
120
|
<td>=</td>
|
@@ -86,19 +86,12 @@
|
|
86
86
|
<h3 class="section-bar">Methods</h3>
|
87
87
|
|
88
88
|
<div class="name-list">
|
89
|
-
<a href="#
|
90
|
-
<a href="#
|
91
|
-
<a href="#
|
92
|
-
<a href="#
|
93
|
-
<a href="#
|
94
|
-
<a href="#
|
95
|
-
<a href="#M000011">init_tables</a>
|
96
|
-
<a href="#M000001">new</a>
|
97
|
-
<a href="#M000006">refresh_classnames</a>
|
98
|
-
<a href="#M000008">reset</a>
|
99
|
-
<a href="#M000012">summary_table</a>
|
100
|
-
<a href="#M000002">train</a>
|
101
|
-
<a href="#M000003">untrain</a>
|
89
|
+
<a href="#M000007">classifications</a>
|
90
|
+
<a href="#M000006">classify</a>
|
91
|
+
<a href="#M000008">get_word_probs</a>
|
92
|
+
<a href="#M000003">new</a>
|
93
|
+
<a href="#M000004">train</a>
|
94
|
+
<a href="#M000005">untrain</a>
|
102
95
|
</div>
|
103
96
|
</div>
|
104
97
|
|
@@ -133,27 +126,25 @@
|
|
133
126
|
<div id="methods">
|
134
127
|
<h3 class="section-bar">Public Class methods</h3>
|
135
128
|
|
136
|
-
<div id="method-
|
137
|
-
<a name="
|
129
|
+
<div id="method-M000003" class="method-detail">
|
130
|
+
<a name="M000003"></a>
|
138
131
|
|
139
132
|
<div class="method-heading">
|
140
|
-
<a href="#
|
141
|
-
<span class="method-name">new</span><span class="method-args">(
|
133
|
+
<a href="#M000003" class="method-signature">
|
134
|
+
<span class="method-name">new</span><span class="method-args">(storage)</span>
|
142
135
|
</a>
|
143
136
|
</div>
|
144
137
|
|
145
138
|
<div class="method-description">
|
146
139
|
<p><a class="source-toggle" href="#"
|
147
|
-
onclick="toggleCode('
|
148
|
-
<div class="method-source-code" id="
|
140
|
+
onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
|
141
|
+
<div class="method-source-code" id="M000003-source">
|
149
142
|
<pre>
|
150
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
151
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">
|
152
|
-
<span class="ruby-ivar">@
|
153
|
-
<span class="ruby-ivar">@
|
154
|
-
<span class="ruby-ivar">@
|
155
|
-
<span class="ruby-identifier">init_tables</span>
|
156
|
-
<span class="ruby-ivar">@classnames</span> = <span class="ruby-identifier">refresh_classnames</span>
|
143
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 6</span>
|
144
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">storage</span>)
|
145
|
+
<span class="ruby-ivar">@storage</span> = <span class="ruby-identifier">storage</span>
|
146
|
+
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">init_tables</span>
|
147
|
+
<span class="ruby-ivar">@classnames</span> = <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">classnames</span>
|
157
148
|
<span class="ruby-keyword kw">end</span>
|
158
149
|
</pre>
|
159
150
|
</div>
|
@@ -162,120 +153,39 @@
|
|
162
153
|
|
163
154
|
<h3 class="section-bar">Public Instance methods</h3>
|
164
155
|
|
165
|
-
<div id="method-
|
166
|
-
<a name="
|
156
|
+
<div id="method-M000007" class="method-detail">
|
157
|
+
<a name="M000007"></a>
|
167
158
|
|
168
159
|
<div class="method-heading">
|
169
|
-
<a href="#
|
160
|
+
<a href="#M000007" class="method-signature">
|
170
161
|
<span class="method-name">classifications</span><span class="method-args">(text)</span>
|
171
162
|
</a>
|
172
163
|
</div>
|
173
164
|
|
174
165
|
<div class="method-description">
|
175
166
|
<p><a class="source-toggle" href="#"
|
176
|
-
onclick="toggleCode('
|
177
|
-
<div class="method-source-code" id="
|
167
|
+
onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
|
168
|
+
<div class="method-source-code" id="M000007-source">
|
178
169
|
<pre>
|
179
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
170
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 44</span>
|
180
171
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>)
|
181
|
-
<span class="ruby-identifier">
|
182
|
-
<span class="ruby-identifier">result</span> = {}
|
183
|
-
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
|
184
|
-
<span class="ruby-identifier">classes</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">NBClass</span>.<span class="ruby-identifier">new</span> <span class="ruby-identifier">k</span>, <span class="ruby-identifier">summary_table</span>, <span class="ruby-identifier">freq_table</span>
|
185
|
-
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-value">0</span>
|
186
|
-
}
|
172
|
+
<span class="ruby-identifier">result</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
|
187
173
|
|
188
|
-
<span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span
|
189
|
-
<span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span
|
190
|
-
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) }
|
174
|
+
<span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
|
175
|
+
<span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>)
|
176
|
+
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) <span class="ruby-operator">*</span> <span class="ruby-identifier">count</span>) }
|
191
177
|
}
|
192
|
-
|
193
|
-
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">classes</span>[<span class="ruby-identifier">k</span>].<span class="ruby-identifier">doc_count</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">doc_count_total</span>) }
|
194
|
-
|
195
|
-
<span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span>(<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>]) }
|
196
|
-
<span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
|
197
|
-
<span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">klass</span><span class="ruby-operator">|</span>
|
198
|
-
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">klass</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">klass</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span>
|
199
|
-
}
|
200
|
-
|
201
|
-
<span class="ruby-identifier">result</span>
|
202
|
-
<span class="ruby-keyword kw">end</span>
|
203
|
-
</pre>
|
204
|
-
</div>
|
205
|
-
</div>
|
206
|
-
</div>
|
207
|
-
|
208
|
-
<div id="method-M000004" class="method-detail">
|
209
|
-
<a name="M000004"></a>
|
210
|
-
|
211
|
-
<div class="method-heading">
|
212
|
-
<a href="#M000004" class="method-signature">
|
213
|
-
<span class="method-name">classify</span><span class="method-args">(text)</span>
|
214
|
-
</a>
|
215
|
-
</div>
|
216
|
-
|
217
|
-
<div class="method-description">
|
218
|
-
<p><a class="source-toggle" href="#"
|
219
|
-
onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
|
220
|
-
<div class="method-source-code" id="M000004-source">
|
221
|
-
<pre>
|
222
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 34</span>
|
223
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>)
|
224
|
-
<span class="ruby-comment cmt"># return the most probable class</span>
|
225
|
-
<span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">sort</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">o</span>,<span class="ruby-identifier">t</span><span class="ruby-operator">|</span> <span class="ruby-identifier">o</span>[<span class="ruby-value">1</span>] <span class="ruby-operator"><=></span> <span class="ruby-identifier">t</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
|
226
|
-
<span class="ruby-keyword kw">end</span>
|
227
|
-
</pre>
|
228
|
-
</div>
|
229
|
-
</div>
|
230
|
-
</div>
|
231
|
-
|
232
|
-
<div id="method-M000009" class="method-detail">
|
233
|
-
<a name="M000009"></a>
|
234
178
|
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
</div>
|
240
|
-
|
241
|
-
<div class="method-description">
|
242
|
-
<p><a class="source-toggle" href="#"
|
243
|
-
onclick="toggleCode('M000009-source');return false;">[Source]</a></p>
|
244
|
-
<div class="method-source-code" id="M000009-source">
|
245
|
-
<pre>
|
246
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 84</span>
|
247
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">doc_count_total</span>
|
248
|
-
<span class="ruby-identifier">total</span> = <span class="ruby-value">0</span>
|
249
|
-
<span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">create_scanner</span>(<span class="ruby-value str">""</span>, <span class="ruby-value str">"totals:doccount"</span>) { <span class="ruby-operator">|</span><span class="ruby-identifier">row</span><span class="ruby-operator">|</span>
|
250
|
-
<span class="ruby-identifier">total</span> <span class="ruby-operator">+=</span> <span class="ruby-identifier">row</span>.<span class="ruby-identifier">columns</span>[<span class="ruby-value str">"totals:doccount"</span>].<span class="ruby-identifier">to_i64</span>
|
179
|
+
<span class="ruby-comment cmt"># add the prior and exponentiate</span>
|
180
|
+
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
|
181
|
+
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">doc_count_total</span>.<span class="ruby-identifier">to_f</span>)
|
182
|
+
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span>(<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>])
|
251
183
|
}
|
252
|
-
<span class="ruby-identifier">total</span>
|
253
|
-
<span class="ruby-keyword kw">end</span>
|
254
|
-
</pre>
|
255
|
-
</div>
|
256
|
-
</div>
|
257
|
-
</div>
|
258
|
-
|
259
|
-
<div id="method-M000007" class="method-detail">
|
260
|
-
<a name="M000007"></a>
|
261
|
-
|
262
|
-
<div class="method-heading">
|
263
|
-
<a href="#M000007" class="method-signature">
|
264
|
-
<span class="method-name">drop_tables</span><span class="method-args">()</span>
|
265
|
-
</a>
|
266
|
-
</div>
|
267
184
|
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
<pre>
|
273
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 72</span>
|
274
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">drop_tables</span>
|
275
|
-
<span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">delete</span>
|
276
|
-
<span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">delete</span>
|
277
|
-
<span class="ruby-ivar">@stable</span> = <span class="ruby-keyword kw">nil</span>
|
278
|
-
<span class="ruby-ivar">@ftable</span> = <span class="ruby-keyword kw">nil</span>
|
185
|
+
<span class="ruby-comment cmt"># normalize to get probs</span>
|
186
|
+
<span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
|
187
|
+
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span> }
|
188
|
+
<span class="ruby-identifier">result</span>
|
279
189
|
<span class="ruby-keyword kw">end</span>
|
280
190
|
</pre>
|
281
191
|
</div>
|
@@ -287,77 +197,52 @@
|
|
287
197
|
|
288
198
|
<div class="method-heading">
|
289
199
|
<a href="#M000006" class="method-signature">
|
290
|
-
<span class="method-name">
|
200
|
+
<span class="method-name">classify</span><span class="method-args">(text)</span>
|
291
201
|
</a>
|
292
202
|
</div>
|
293
203
|
|
294
204
|
<div class="method-description">
|
295
|
-
<p>
|
296
|
-
get all classes
|
297
|
-
</p>
|
298
205
|
<p><a class="source-toggle" href="#"
|
299
206
|
onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
|
300
207
|
<div class="method-source-code" id="M000006-source">
|
301
208
|
<pre>
|
302
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
303
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">
|
304
|
-
<span class="ruby-
|
305
|
-
<span class="ruby-identifier">
|
306
|
-
<span class="ruby-identifier">cs</span> <span class="ruby-operator"><<</span> <span class="ruby-identifier">row</span>.<span class="ruby-identifier">row</span>.<span class="ruby-identifier">intern</span>
|
307
|
-
}
|
308
|
-
<span class="ruby-identifier">cs</span>
|
309
|
-
<span class="ruby-keyword kw">end</span>
|
310
|
-
</pre>
|
311
|
-
</div>
|
312
|
-
</div>
|
313
|
-
</div>
|
314
|
-
|
315
|
-
<div id="method-M000008" class="method-detail">
|
316
|
-
<a name="M000008"></a>
|
317
|
-
|
318
|
-
<div class="method-heading">
|
319
|
-
<a href="#M000008" class="method-signature">
|
320
|
-
<span class="method-name">reset</span><span class="method-args">()</span>
|
321
|
-
</a>
|
322
|
-
</div>
|
323
|
-
|
324
|
-
<div class="method-description">
|
325
|
-
<p><a class="source-toggle" href="#"
|
326
|
-
onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
|
327
|
-
<div class="method-source-code" id="M000008-source">
|
328
|
-
<pre>
|
329
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 79</span>
|
330
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">reset</span>
|
331
|
-
<span class="ruby-identifier">drop_tables</span>
|
332
|
-
<span class="ruby-identifier">init_tables</span>
|
209
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 39</span>
|
210
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>)
|
211
|
+
<span class="ruby-comment cmt"># return the most probable class</span>
|
212
|
+
<span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">sort_by</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">c</span><span class="ruby-operator">|</span> <span class="ruby-operator">-</span><span class="ruby-identifier">c</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
|
333
213
|
<span class="ruby-keyword kw">end</span>
|
334
214
|
</pre>
|
335
215
|
</div>
|
336
216
|
</div>
|
337
217
|
</div>
|
338
218
|
|
339
|
-
<div id="method-
|
340
|
-
<a name="
|
219
|
+
<div id="method-M000004" class="method-detail">
|
220
|
+
<a name="M000004"></a>
|
341
221
|
|
342
222
|
<div class="method-heading">
|
343
|
-
<a href="#
|
344
|
-
<span class="method-name">train</span><span class="method-args">(klass, text)</span>
|
223
|
+
<a href="#M000004" class="method-signature">
|
224
|
+
<span class="method-name">train</span><span class="method-args">(klass, text) {|word, count if block_given?| ...}</span>
|
345
225
|
</a>
|
346
226
|
</div>
|
347
227
|
|
348
228
|
<div class="method-description">
|
229
|
+
<p>
|
230
|
+
text can be either an array of strings or a string klass is a symbol
|
231
|
+
</p>
|
349
232
|
<p><a class="source-toggle" href="#"
|
350
|
-
onclick="toggleCode('
|
351
|
-
<div class="method-source-code" id="
|
233
|
+
onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
|
234
|
+
<div class="method-source-code" id="M000004-source">
|
352
235
|
<pre>
|
353
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
236
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 14</span>
|
354
237
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">train</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">text</span>)
|
355
238
|
<span class="ruby-identifier">th</span> = <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>)
|
356
239
|
<span class="ruby-identifier">th</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
|
357
|
-
<span class="ruby-
|
240
|
+
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span>
|
241
|
+
<span class="ruby-keyword kw">yield</span> <span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">block_given?</span>
|
358
242
|
}
|
359
|
-
<span class="ruby-
|
360
|
-
<span class="ruby-identifier">
|
243
|
+
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_total_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
|
244
|
+
<span class="ruby-identifier">doccount</span> = (<span class="ruby-identifier">text</span>.<span class="ruby-identifier">kind_of?</span> <span class="ruby-constant">Array</span>) <span class="ruby-operator">?</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">:</span> <span class="ruby-value">1</span>
|
245
|
+
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_doc_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">doccount</span>
|
361
246
|
<span class="ruby-ivar">@classnames</span> <span class="ruby-operator"><<</span> <span class="ruby-identifier">klass</span> <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">klass</span>
|
362
247
|
<span class="ruby-keyword kw">end</span>
|
363
248
|
</pre>
|
@@ -365,28 +250,33 @@ get all classes
|
|
365
250
|
</div>
|
366
251
|
</div>
|
367
252
|
|
368
|
-
<div id="method-
|
369
|
-
<a name="
|
253
|
+
<div id="method-M000005" class="method-detail">
|
254
|
+
<a name="M000005"></a>
|
370
255
|
|
371
256
|
<div class="method-heading">
|
372
|
-
<a href="#
|
373
|
-
<span class="method-name">untrain</span><span class="method-args">(klass, text)</span>
|
257
|
+
<a href="#M000005" class="method-signature">
|
258
|
+
<span class="method-name">untrain</span><span class="method-args">(klass, text) {|word, count if block_given?| ...}</span>
|
374
259
|
</a>
|
375
260
|
</div>
|
376
261
|
|
377
262
|
<div class="method-description">
|
263
|
+
<p>
|
264
|
+
text can be either an array of strings or a string klass is a symbol
|
265
|
+
</p>
|
378
266
|
<p><a class="source-toggle" href="#"
|
379
|
-
onclick="toggleCode('
|
380
|
-
<div class="method-source-code" id="
|
267
|
+
onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
|
268
|
+
<div class="method-source-code" id="M000005-source">
|
381
269
|
<pre>
|
382
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
270
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 28</span>
|
383
271
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">untrain</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">text</span>)
|
384
272
|
<span class="ruby-identifier">th</span> = <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>)
|
385
273
|
<span class="ruby-identifier">th</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
|
386
|
-
<span class="ruby-
|
274
|
+
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">word</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">count</span>
|
275
|
+
<span class="ruby-keyword kw">yield</span> <span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">block_given?</span>
|
387
276
|
}
|
388
|
-
<span class="ruby-
|
389
|
-
<span class="ruby-identifier">
|
277
|
+
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_total_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
|
278
|
+
<span class="ruby-identifier">doccount</span> = (<span class="ruby-identifier">text</span>.<span class="ruby-identifier">kind_of?</span> <span class="ruby-constant">Array</span>) <span class="ruby-operator">?</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">:</span> <span class="ruby-value">1</span>
|
279
|
+
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_doc_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">doccount</span>
|
390
280
|
<span class="ruby-keyword kw">end</span>
|
391
281
|
</pre>
|
392
282
|
</div>
|
@@ -395,53 +285,26 @@ get all classes
|
|
395
285
|
|
396
286
|
<h3 class="section-bar">Protected Instance methods</h3>
|
397
287
|
|
398
|
-
<div id="method-
|
399
|
-
<a name="
|
400
|
-
|
401
|
-
<div class="method-heading">
|
402
|
-
<a href="#M000013" class="method-signature">
|
403
|
-
<span class="method-name">freq_table</span><span class="method-args">()</span>
|
404
|
-
</a>
|
405
|
-
</div>
|
406
|
-
|
407
|
-
<div class="method-description">
|
408
|
-
<p><a class="source-toggle" href="#"
|
409
|
-
onclick="toggleCode('M000013-source');return false;">[Source]</a></p>
|
410
|
-
<div class="method-source-code" id="M000013-source">
|
411
|
-
<pre>
|
412
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 120</span>
|
413
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">freq_table</span>
|
414
|
-
<span class="ruby-ivar">@ftable</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">get_table</span> <span class="ruby-ivar">@ftablename</span>
|
415
|
-
<span class="ruby-keyword kw">end</span>
|
416
|
-
</pre>
|
417
|
-
</div>
|
418
|
-
</div>
|
419
|
-
</div>
|
420
|
-
|
421
|
-
<div id="method-M000010" class="method-detail">
|
422
|
-
<a name="M000010"></a>
|
288
|
+
<div id="method-M000008" class="method-detail">
|
289
|
+
<a name="M000008"></a>
|
423
290
|
|
424
291
|
<div class="method-heading">
|
425
|
-
<a href="#
|
426
|
-
<span class="method-name">get_word_probs</span><span class="method-args">(word
|
292
|
+
<a href="#M000008" class="method-signature">
|
293
|
+
<span class="method-name">get_word_probs</span><span class="method-args">(word)</span>
|
427
294
|
</a>
|
428
295
|
</div>
|
429
296
|
|
430
297
|
<div class="method-description">
|
431
298
|
<p><a class="source-toggle" href="#"
|
432
|
-
onclick="toggleCode('
|
433
|
-
<div class="method-source-code" id="
|
299
|
+
onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
|
300
|
+
<div class="method-source-code" id="M000008-source">
|
434
301
|
<pre>
|
435
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
436
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span
|
437
|
-
<span class="ruby-identifier">probs</span> =
|
438
|
-
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cn</span><span class="ruby-operator">|</span>
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
<span class="ruby-identifier">row</span>.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">columns</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">colname</span>, <span class="ruby-identifier">cell</span><span class="ruby-operator">|</span>
|
443
|
-
<span class="ruby-identifier">classname</span> = <span class="ruby-identifier">colname</span>.<span class="ruby-identifier">split</span>(<span class="ruby-value str">':'</span>)[<span class="ruby-value">1</span>].<span class="ruby-identifier">intern</span>
|
444
|
-
<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">classname</span>] = <span class="ruby-identifier">cell</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">classes</span>[<span class="ruby-identifier">classname</span>].<span class="ruby-identifier">word_count</span>
|
302
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 65</span>
|
303
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>)
|
304
|
+
<span class="ruby-identifier">probs</span> = <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_word_counts</span>(<span class="ruby-identifier">word</span>)
|
305
|
+
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cn</span><span class="ruby-operator">|</span>
|
306
|
+
<span class="ruby-comment cmt"># use a laplacian smoother</span>
|
307
|
+
<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] = (<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> (<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_total_word_count</span>(<span class="ruby-identifier">cn</span>) <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span>
|
445
308
|
}
|
446
309
|
<span class="ruby-identifier">probs</span>
|
447
310
|
<span class="ruby-keyword kw">end</span>
|
@@ -450,58 +313,6 @@ get all classes
|
|
450
313
|
</div>
|
451
314
|
</div>
|
452
315
|
|
453
|
-
<div id="method-M000011" class="method-detail">
|
454
|
-
<a name="M000011"></a>
|
455
|
-
|
456
|
-
<div class="method-heading">
|
457
|
-
<a href="#M000011" class="method-signature">
|
458
|
-
<span class="method-name">init_tables</span><span class="method-args">()</span>
|
459
|
-
</a>
|
460
|
-
</div>
|
461
|
-
|
462
|
-
<div class="method-description">
|
463
|
-
<p><a class="source-toggle" href="#"
|
464
|
-
onclick="toggleCode('M000011-source');return false;">[Source]</a></p>
|
465
|
-
<div class="method-source-code" id="M000011-source">
|
466
|
-
<pre>
|
467
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 106</span>
|
468
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">init_tables</span>
|
469
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">has_table?</span> <span class="ruby-ivar">@ftablename</span>
|
470
|
-
<span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">create_table</span> <span class="ruby-ivar">@ftablename</span>, <span class="ruby-value str">"classes"</span>, <span class="ruby-value str">"total"</span>
|
471
|
-
<span class="ruby-keyword kw">end</span>
|
472
|
-
|
473
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">has_table?</span> <span class="ruby-ivar">@stablename</span>
|
474
|
-
<span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">create_table</span> <span class="ruby-ivar">@stablename</span>, <span class="ruby-value str">"totals"</span>
|
475
|
-
<span class="ruby-keyword kw">end</span>
|
476
|
-
<span class="ruby-keyword kw">end</span>
|
477
|
-
</pre>
|
478
|
-
</div>
|
479
|
-
</div>
|
480
|
-
</div>
|
481
|
-
|
482
|
-
<div id="method-M000012" class="method-detail">
|
483
|
-
<a name="M000012"></a>
|
484
|
-
|
485
|
-
<div class="method-heading">
|
486
|
-
<a href="#M000012" class="method-signature">
|
487
|
-
<span class="method-name">summary_table</span><span class="method-args">()</span>
|
488
|
-
</a>
|
489
|
-
</div>
|
490
|
-
|
491
|
-
<div class="method-description">
|
492
|
-
<p><a class="source-toggle" href="#"
|
493
|
-
onclick="toggleCode('M000012-source');return false;">[Source]</a></p>
|
494
|
-
<div class="method-source-code" id="M000012-source">
|
495
|
-
<pre>
|
496
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 116</span>
|
497
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">summary_table</span>
|
498
|
-
<span class="ruby-ivar">@stable</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">get_table</span> <span class="ruby-ivar">@stablename</span>
|
499
|
-
<span class="ruby-keyword kw">end</span>
|
500
|
-
</pre>
|
501
|
-
</div>
|
502
|
-
</div>
|
503
|
-
</div>
|
504
|
-
|
505
316
|
|
506
317
|
</div>
|
507
318
|
|