ankusa 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +6 -4
- data/Rakefile +2 -2
- data/docs/classes/Ankusa.html +8 -8
- data/docs/classes/Ankusa/Classifier.html +82 -271
- data/docs/classes/Ankusa/HBaseStorage.html +537 -0
- data/docs/classes/Ankusa/MemoryStorage.html +439 -0
- data/docs/classes/Ankusa/TextHash.html +84 -29
- data/docs/classes/String.html +172 -0
- data/docs/created.rid +1 -1
- data/docs/files/README_rdoc.html +6 -4
- data/docs/files/lib/ankusa/classifier_rb.html +1 -1
- data/docs/files/lib/ankusa/extensions_rb.html +108 -0
- data/docs/files/lib/ankusa/hasher_rb.html +1 -1
- data/docs/files/lib/ankusa/hbase_storage_rb.html +108 -0
- data/docs/files/lib/ankusa/{nbclass_rb.html → memory_storage_rb.html} +4 -4
- data/docs/files/lib/ankusa_rb.html +4 -2
- data/docs/fr_class_index.html +3 -1
- data/docs/fr_file_index.html +3 -1
- data/docs/fr_method_index.html +41 -17
- data/lib/ankusa.rb +3 -1
- data/lib/ankusa/classifier.rb +37 -86
- data/lib/ankusa/extensions.rb +13 -0
- data/lib/ankusa/hasher.rb +24 -10
- data/lib/ankusa/hbase_storage.rb +109 -0
- data/lib/ankusa/memory_storage.rb +61 -0
- metadata +13 -7
- data/docs/classes/Ankusa/NBClass.html +0 -168
- data/lib/ankusa/nbclass.rb +0 -15
data/README.rdoc
CHANGED
@@ -10,12 +10,11 @@ First, install HBase / Hadoop. Make sure the HBase Thrift interface has been st
|
|
10
10
|
== Basic Usage
|
11
11
|
require 'rubygems'
|
12
12
|
require 'ankusa'
|
13
|
-
require 'hbaserb'
|
14
13
|
|
15
14
|
# connect to HBase
|
16
|
-
|
15
|
+
storage = Ankusa::HBaseStorage.new 'localhost'
|
16
|
+
c = Ankusa::Classifier.new storage
|
17
17
|
|
18
|
-
c = Classifier.new client
|
19
18
|
c.train :spam, "This is some spammy text"
|
20
19
|
c.train :good, "This is not the bad stuff"
|
21
20
|
|
@@ -27,4 +26,7 @@ First, install HBase / Hadoop. Make sure the HBase Thrift interface has been st
|
|
27
26
|
puts c.classifications "This is some spammy text"
|
28
27
|
|
29
28
|
# get a list of all classes
|
30
|
-
puts c.classes
|
29
|
+
puts c.classes
|
30
|
+
|
31
|
+
# close connection
|
32
|
+
storage.close
|
data/Rakefile
CHANGED
@@ -22,9 +22,9 @@ Rake::TestTask.new("test") { |t|
|
|
22
22
|
|
23
23
|
spec = Gem::Specification.new do |s|
|
24
24
|
s.name = "ankusa"
|
25
|
-
s.version = "0.0.2"
|
25
|
+
s.version = "0.0.3"
|
26
26
|
s.authors = ["Brian Muller"]
|
27
|
-
s.date = %q{2010-
|
27
|
+
s.date = %q{2010-12-02}
|
28
28
|
s.description = "Naive Bayes classifier with HBase storage"
|
29
29
|
s.summary = "Naive Bayes classifier in Ruby that uses Hadoop's HBase for storage"
|
30
30
|
s.email = "brian.muller@livingsocial.com"
|
data/docs/classes/Ankusa.html
CHANGED
@@ -63,8 +63,12 @@
|
|
63
63
|
lib/ankusa/hasher.rb
|
64
64
|
</a>
|
65
65
|
<br />
|
66
|
-
<a href="../files/lib/ankusa/nbclass_rb.html">
|
67
|
-
lib/ankusa/nbclass.rb
|
66
|
+
<a href="../files/lib/ankusa/hbase_storage_rb.html">
|
67
|
+
lib/ankusa/hbase_storage.rb
|
68
|
+
</a>
|
69
|
+
<br />
|
70
|
+
<a href="../files/lib/ankusa/memory_storage_rb.html">
|
71
|
+
lib/ankusa/memory_storage.rb
|
68
72
|
</a>
|
69
73
|
<br />
|
70
74
|
<a href="../files/lib/ankusa/stopwords_rb.html">
|
@@ -100,7 +104,8 @@
|
|
100
104
|
<h3 class="section-bar">Classes and Modules</h3>
|
101
105
|
|
102
106
|
Class <a href="Ankusa/Classifier.html" class="link">Ankusa::Classifier</a><br />
|
103
|
-
Class <a href="Ankusa/NBClass.html" class="link">Ankusa::NBClass</a><br />
|
107
|
+
Class <a href="Ankusa/HBaseStorage.html" class="link">Ankusa::HBaseStorage</a><br />
|
108
|
+
Class <a href="Ankusa/MemoryStorage.html" class="link">Ankusa::MemoryStorage</a><br />
|
104
109
|
Class <a href="Ankusa/TextHash.html" class="link">Ankusa::TextHash</a><br />
|
105
110
|
|
106
111
|
</div>
|
@@ -110,11 +115,6 @@ Class <a href="Ankusa/TextHash.html" class="link">Ankusa::TextHash</a><br />
|
|
110
115
|
|
111
116
|
<div class="name-list">
|
112
117
|
<table summary="Constants">
|
113
|
-
<tr class="top-aligned-row context-row">
|
114
|
-
<td class="context-item-name">SMALL_PROB</td>
|
115
|
-
<td>=</td>
|
116
|
-
<td class="context-item-value">0.0001</td>
|
117
|
-
</tr>
|
118
118
|
<tr class="top-aligned-row context-row">
|
119
119
|
<td class="context-item-name">STOPWORDS</td>
|
120
120
|
<td>=</td>
|
@@ -86,19 +86,12 @@
|
|
86
86
|
<h3 class="section-bar">Methods</h3>
|
87
87
|
|
88
88
|
<div class="name-list">
|
89
|
-
<a href="#
|
90
|
-
<a href="#
|
91
|
-
<a href="#
|
92
|
-
<a href="#
|
93
|
-
<a href="#
|
94
|
-
<a href="#
|
95
|
-
<a href="#M000011">init_tables</a>
|
96
|
-
<a href="#M000001">new</a>
|
97
|
-
<a href="#M000006">refresh_classnames</a>
|
98
|
-
<a href="#M000008">reset</a>
|
99
|
-
<a href="#M000012">summary_table</a>
|
100
|
-
<a href="#M000002">train</a>
|
101
|
-
<a href="#M000003">untrain</a>
|
89
|
+
<a href="#M000007">classifications</a>
|
90
|
+
<a href="#M000006">classify</a>
|
91
|
+
<a href="#M000008">get_word_probs</a>
|
92
|
+
<a href="#M000003">new</a>
|
93
|
+
<a href="#M000004">train</a>
|
94
|
+
<a href="#M000005">untrain</a>
|
102
95
|
</div>
|
103
96
|
</div>
|
104
97
|
|
@@ -133,27 +126,25 @@
|
|
133
126
|
<div id="methods">
|
134
127
|
<h3 class="section-bar">Public Class methods</h3>
|
135
128
|
|
136
|
-
<div id="method-
|
137
|
-
<a name="
|
129
|
+
<div id="method-M000003" class="method-detail">
|
130
|
+
<a name="M000003"></a>
|
138
131
|
|
139
132
|
<div class="method-heading">
|
140
|
-
<a href="#
|
141
|
-
<span class="method-name">new</span><span class="method-args">(
|
133
|
+
<a href="#M000003" class="method-signature">
|
134
|
+
<span class="method-name">new</span><span class="method-args">(storage)</span>
|
142
135
|
</a>
|
143
136
|
</div>
|
144
137
|
|
145
138
|
<div class="method-description">
|
146
139
|
<p><a class="source-toggle" href="#"
|
147
|
-
onclick="toggleCode('
|
148
|
-
<div class="method-source-code" id="
|
140
|
+
onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
|
141
|
+
<div class="method-source-code" id="M000003-source">
|
149
142
|
<pre>
|
150
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
151
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">
|
152
|
-
<span class="ruby-ivar">@
|
153
|
-
<span class="ruby-ivar">@
|
154
|
-
<span class="ruby-ivar">@
|
155
|
-
<span class="ruby-identifier">init_tables</span>
|
156
|
-
<span class="ruby-ivar">@classnames</span> = <span class="ruby-identifier">refresh_classnames</span>
|
143
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 6</span>
|
144
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">storage</span>)
|
145
|
+
<span class="ruby-ivar">@storage</span> = <span class="ruby-identifier">storage</span>
|
146
|
+
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">init_tables</span>
|
147
|
+
<span class="ruby-ivar">@classnames</span> = <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">classnames</span>
|
157
148
|
<span class="ruby-keyword kw">end</span>
|
158
149
|
</pre>
|
159
150
|
</div>
|
@@ -162,120 +153,39 @@
|
|
162
153
|
|
163
154
|
<h3 class="section-bar">Public Instance methods</h3>
|
164
155
|
|
165
|
-
<div id="method-
|
166
|
-
<a name="
|
156
|
+
<div id="method-M000007" class="method-detail">
|
157
|
+
<a name="M000007"></a>
|
167
158
|
|
168
159
|
<div class="method-heading">
|
169
|
-
<a href="#
|
160
|
+
<a href="#M000007" class="method-signature">
|
170
161
|
<span class="method-name">classifications</span><span class="method-args">(text)</span>
|
171
162
|
</a>
|
172
163
|
</div>
|
173
164
|
|
174
165
|
<div class="method-description">
|
175
166
|
<p><a class="source-toggle" href="#"
|
176
|
-
onclick="toggleCode('
|
177
|
-
<div class="method-source-code" id="
|
167
|
+
onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
|
168
|
+
<div class="method-source-code" id="M000007-source">
|
178
169
|
<pre>
|
179
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
170
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 44</span>
|
180
171
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>)
|
181
|
-
<span class="ruby-identifier">
|
182
|
-
<span class="ruby-identifier">result</span> = {}
|
183
|
-
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
|
184
|
-
<span class="ruby-identifier">classes</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">NBClass</span>.<span class="ruby-identifier">new</span> <span class="ruby-identifier">k</span>, <span class="ruby-identifier">summary_table</span>, <span class="ruby-identifier">freq_table</span>
|
185
|
-
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-value">0</span>
|
186
|
-
}
|
172
|
+
<span class="ruby-identifier">result</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
|
187
173
|
|
188
|
-
<span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span
|
189
|
-
<span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span
|
190
|
-
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) }
|
174
|
+
<span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
|
175
|
+
<span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>)
|
176
|
+
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) <span class="ruby-operator">*</span> <span class="ruby-identifier">count</span>) }
|
191
177
|
}
|
192
|
-
|
193
|
-
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">classes</span>[<span class="ruby-identifier">k</span>].<span class="ruby-identifier">doc_count</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">doc_count_total</span>) }
|
194
|
-
|
195
|
-
<span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span>(<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>]) }
|
196
|
-
<span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
|
197
|
-
<span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">klass</span><span class="ruby-operator">|</span>
|
198
|
-
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">klass</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">klass</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span>
|
199
|
-
}
|
200
|
-
|
201
|
-
<span class="ruby-identifier">result</span>
|
202
|
-
<span class="ruby-keyword kw">end</span>
|
203
|
-
</pre>
|
204
|
-
</div>
|
205
|
-
</div>
|
206
|
-
</div>
|
207
|
-
|
208
|
-
<div id="method-M000004" class="method-detail">
|
209
|
-
<a name="M000004"></a>
|
210
|
-
|
211
|
-
<div class="method-heading">
|
212
|
-
<a href="#M000004" class="method-signature">
|
213
|
-
<span class="method-name">classify</span><span class="method-args">(text)</span>
|
214
|
-
</a>
|
215
|
-
</div>
|
216
|
-
|
217
|
-
<div class="method-description">
|
218
|
-
<p><a class="source-toggle" href="#"
|
219
|
-
onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
|
220
|
-
<div class="method-source-code" id="M000004-source">
|
221
|
-
<pre>
|
222
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 34</span>
|
223
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>)
|
224
|
-
<span class="ruby-comment cmt"># return the most probable class</span>
|
225
|
-
<span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">sort</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">o</span>,<span class="ruby-identifier">t</span><span class="ruby-operator">|</span> <span class="ruby-identifier">o</span>[<span class="ruby-value">1</span>] <span class="ruby-operator"><=></span> <span class="ruby-identifier">t</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
|
226
|
-
<span class="ruby-keyword kw">end</span>
|
227
|
-
</pre>
|
228
|
-
</div>
|
229
|
-
</div>
|
230
|
-
</div>
|
231
|
-
|
232
|
-
<div id="method-M000009" class="method-detail">
|
233
|
-
<a name="M000009"></a>
|
234
178
|
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
</div>
|
240
|
-
|
241
|
-
<div class="method-description">
|
242
|
-
<p><a class="source-toggle" href="#"
|
243
|
-
onclick="toggleCode('M000009-source');return false;">[Source]</a></p>
|
244
|
-
<div class="method-source-code" id="M000009-source">
|
245
|
-
<pre>
|
246
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 84</span>
|
247
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">doc_count_total</span>
|
248
|
-
<span class="ruby-identifier">total</span> = <span class="ruby-value">0</span>
|
249
|
-
<span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">create_scanner</span>(<span class="ruby-value str">""</span>, <span class="ruby-value str">"totals:doccount"</span>) { <span class="ruby-operator">|</span><span class="ruby-identifier">row</span><span class="ruby-operator">|</span>
|
250
|
-
<span class="ruby-identifier">total</span> <span class="ruby-operator">+=</span> <span class="ruby-identifier">row</span>.<span class="ruby-identifier">columns</span>[<span class="ruby-value str">"totals:doccount"</span>].<span class="ruby-identifier">to_i64</span>
|
179
|
+
<span class="ruby-comment cmt"># add the prior and exponentiate</span>
|
180
|
+
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
|
181
|
+
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">doc_count_total</span>.<span class="ruby-identifier">to_f</span>)
|
182
|
+
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span>(<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>])
|
251
183
|
}
|
252
|
-
<span class="ruby-identifier">total</span>
|
253
|
-
<span class="ruby-keyword kw">end</span>
|
254
|
-
</pre>
|
255
|
-
</div>
|
256
|
-
</div>
|
257
|
-
</div>
|
258
|
-
|
259
|
-
<div id="method-M000007" class="method-detail">
|
260
|
-
<a name="M000007"></a>
|
261
|
-
|
262
|
-
<div class="method-heading">
|
263
|
-
<a href="#M000007" class="method-signature">
|
264
|
-
<span class="method-name">drop_tables</span><span class="method-args">()</span>
|
265
|
-
</a>
|
266
|
-
</div>
|
267
184
|
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
<pre>
|
273
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 72</span>
|
274
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">drop_tables</span>
|
275
|
-
<span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">delete</span>
|
276
|
-
<span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">delete</span>
|
277
|
-
<span class="ruby-ivar">@stable</span> = <span class="ruby-keyword kw">nil</span>
|
278
|
-
<span class="ruby-ivar">@ftable</span> = <span class="ruby-keyword kw">nil</span>
|
185
|
+
<span class="ruby-comment cmt"># normalize to get probs</span>
|
186
|
+
<span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
|
187
|
+
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span> }
|
188
|
+
<span class="ruby-identifier">result</span>
|
279
189
|
<span class="ruby-keyword kw">end</span>
|
280
190
|
</pre>
|
281
191
|
</div>
|
@@ -287,77 +197,52 @@
|
|
287
197
|
|
288
198
|
<div class="method-heading">
|
289
199
|
<a href="#M000006" class="method-signature">
|
290
|
-
<span class="method-name">
|
200
|
+
<span class="method-name">classify</span><span class="method-args">(text)</span>
|
291
201
|
</a>
|
292
202
|
</div>
|
293
203
|
|
294
204
|
<div class="method-description">
|
295
|
-
<p>
|
296
|
-
get all classes
|
297
|
-
</p>
|
298
205
|
<p><a class="source-toggle" href="#"
|
299
206
|
onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
|
300
207
|
<div class="method-source-code" id="M000006-source">
|
301
208
|
<pre>
|
302
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
303
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">
|
304
|
-
<span class="ruby-
|
305
|
-
<span class="ruby-identifier">
|
306
|
-
<span class="ruby-identifier">cs</span> <span class="ruby-operator"><<</span> <span class="ruby-identifier">row</span>.<span class="ruby-identifier">row</span>.<span class="ruby-identifier">intern</span>
|
307
|
-
}
|
308
|
-
<span class="ruby-identifier">cs</span>
|
309
|
-
<span class="ruby-keyword kw">end</span>
|
310
|
-
</pre>
|
311
|
-
</div>
|
312
|
-
</div>
|
313
|
-
</div>
|
314
|
-
|
315
|
-
<div id="method-M000008" class="method-detail">
|
316
|
-
<a name="M000008"></a>
|
317
|
-
|
318
|
-
<div class="method-heading">
|
319
|
-
<a href="#M000008" class="method-signature">
|
320
|
-
<span class="method-name">reset</span><span class="method-args">()</span>
|
321
|
-
</a>
|
322
|
-
</div>
|
323
|
-
|
324
|
-
<div class="method-description">
|
325
|
-
<p><a class="source-toggle" href="#"
|
326
|
-
onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
|
327
|
-
<div class="method-source-code" id="M000008-source">
|
328
|
-
<pre>
|
329
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 79</span>
|
330
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">reset</span>
|
331
|
-
<span class="ruby-identifier">drop_tables</span>
|
332
|
-
<span class="ruby-identifier">init_tables</span>
|
209
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 39</span>
|
210
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>)
|
211
|
+
<span class="ruby-comment cmt"># return the most probable class</span>
|
212
|
+
<span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">sort_by</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">c</span><span class="ruby-operator">|</span> <span class="ruby-operator">-</span><span class="ruby-identifier">c</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
|
333
213
|
<span class="ruby-keyword kw">end</span>
|
334
214
|
</pre>
|
335
215
|
</div>
|
336
216
|
</div>
|
337
217
|
</div>
|
338
218
|
|
339
|
-
<div id="method-
|
340
|
-
<a name="
|
219
|
+
<div id="method-M000004" class="method-detail">
|
220
|
+
<a name="M000004"></a>
|
341
221
|
|
342
222
|
<div class="method-heading">
|
343
|
-
<a href="#
|
344
|
-
<span class="method-name">train</span><span class="method-args">(klass, text)</span>
|
223
|
+
<a href="#M000004" class="method-signature">
|
224
|
+
<span class="method-name">train</span><span class="method-args">(klass, text) {|word, count if block_given?| ...}</span>
|
345
225
|
</a>
|
346
226
|
</div>
|
347
227
|
|
348
228
|
<div class="method-description">
|
229
|
+
<p>
|
230
|
+
text can be either an array of strings or a string klass is a symbol
|
231
|
+
</p>
|
349
232
|
<p><a class="source-toggle" href="#"
|
350
|
-
onclick="toggleCode('
|
351
|
-
<div class="method-source-code" id="
|
233
|
+
onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
|
234
|
+
<div class="method-source-code" id="M000004-source">
|
352
235
|
<pre>
|
353
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
236
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 14</span>
|
354
237
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">train</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">text</span>)
|
355
238
|
<span class="ruby-identifier">th</span> = <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>)
|
356
239
|
<span class="ruby-identifier">th</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
|
357
|
-
<span class="ruby-
|
240
|
+
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span>
|
241
|
+
<span class="ruby-keyword kw">yield</span> <span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">block_given?</span>
|
358
242
|
}
|
359
|
-
<span class="ruby-
|
360
|
-
<span class="ruby-identifier">
|
243
|
+
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_total_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
|
244
|
+
<span class="ruby-identifier">doccount</span> = (<span class="ruby-identifier">text</span>.<span class="ruby-identifier">kind_of?</span> <span class="ruby-constant">Array</span>) <span class="ruby-operator">?</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">:</span> <span class="ruby-value">1</span>
|
245
|
+
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_doc_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">doccount</span>
|
361
246
|
<span class="ruby-ivar">@classnames</span> <span class="ruby-operator"><<</span> <span class="ruby-identifier">klass</span> <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">klass</span>
|
362
247
|
<span class="ruby-keyword kw">end</span>
|
363
248
|
</pre>
|
@@ -365,28 +250,33 @@ get all classes
|
|
365
250
|
</div>
|
366
251
|
</div>
|
367
252
|
|
368
|
-
<div id="method-
|
369
|
-
<a name="
|
253
|
+
<div id="method-M000005" class="method-detail">
|
254
|
+
<a name="M000005"></a>
|
370
255
|
|
371
256
|
<div class="method-heading">
|
372
|
-
<a href="#
|
373
|
-
<span class="method-name">untrain</span><span class="method-args">(klass, text)</span>
|
257
|
+
<a href="#M000005" class="method-signature">
|
258
|
+
<span class="method-name">untrain</span><span class="method-args">(klass, text) {|word, count if block_given?| ...}</span>
|
374
259
|
</a>
|
375
260
|
</div>
|
376
261
|
|
377
262
|
<div class="method-description">
|
263
|
+
<p>
|
264
|
+
text can be either an array of strings or a string klass is a symbol
|
265
|
+
</p>
|
378
266
|
<p><a class="source-toggle" href="#"
|
379
|
-
onclick="toggleCode('
|
380
|
-
<div class="method-source-code" id="
|
267
|
+
onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
|
268
|
+
<div class="method-source-code" id="M000005-source">
|
381
269
|
<pre>
|
382
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
270
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 28</span>
|
383
271
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">untrain</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">text</span>)
|
384
272
|
<span class="ruby-identifier">th</span> = <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>)
|
385
273
|
<span class="ruby-identifier">th</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
|
386
|
-
<span class="ruby-
|
274
|
+
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">word</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">count</span>
|
275
|
+
<span class="ruby-keyword kw">yield</span> <span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">block_given?</span>
|
387
276
|
}
|
388
|
-
<span class="ruby-
|
389
|
-
<span class="ruby-identifier">
|
277
|
+
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_total_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
|
278
|
+
<span class="ruby-identifier">doccount</span> = (<span class="ruby-identifier">text</span>.<span class="ruby-identifier">kind_of?</span> <span class="ruby-constant">Array</span>) <span class="ruby-operator">?</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">:</span> <span class="ruby-value">1</span>
|
279
|
+
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_doc_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">doccount</span>
|
390
280
|
<span class="ruby-keyword kw">end</span>
|
391
281
|
</pre>
|
392
282
|
</div>
|
@@ -395,53 +285,26 @@ get all classes
|
|
395
285
|
|
396
286
|
<h3 class="section-bar">Protected Instance methods</h3>
|
397
287
|
|
398
|
-
<div id="method-
|
399
|
-
<a name="
|
400
|
-
|
401
|
-
<div class="method-heading">
|
402
|
-
<a href="#M000013" class="method-signature">
|
403
|
-
<span class="method-name">freq_table</span><span class="method-args">()</span>
|
404
|
-
</a>
|
405
|
-
</div>
|
406
|
-
|
407
|
-
<div class="method-description">
|
408
|
-
<p><a class="source-toggle" href="#"
|
409
|
-
onclick="toggleCode('M000013-source');return false;">[Source]</a></p>
|
410
|
-
<div class="method-source-code" id="M000013-source">
|
411
|
-
<pre>
|
412
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 120</span>
|
413
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">freq_table</span>
|
414
|
-
<span class="ruby-ivar">@ftable</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">get_table</span> <span class="ruby-ivar">@ftablename</span>
|
415
|
-
<span class="ruby-keyword kw">end</span>
|
416
|
-
</pre>
|
417
|
-
</div>
|
418
|
-
</div>
|
419
|
-
</div>
|
420
|
-
|
421
|
-
<div id="method-M000010" class="method-detail">
|
422
|
-
<a name="M000010"></a>
|
288
|
+
<div id="method-M000008" class="method-detail">
|
289
|
+
<a name="M000008"></a>
|
423
290
|
|
424
291
|
<div class="method-heading">
|
425
|
-
<a href="#
|
426
|
-
<span class="method-name">get_word_probs</span><span class="method-args">(word
|
292
|
+
<a href="#M000008" class="method-signature">
|
293
|
+
<span class="method-name">get_word_probs</span><span class="method-args">(word)</span>
|
427
294
|
</a>
|
428
295
|
</div>
|
429
296
|
|
430
297
|
<div class="method-description">
|
431
298
|
<p><a class="source-toggle" href="#"
|
432
|
-
onclick="toggleCode('
|
433
|
-
<div class="method-source-code" id="
|
299
|
+
onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
|
300
|
+
<div class="method-source-code" id="M000008-source">
|
434
301
|
<pre>
|
435
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line
|
436
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span
|
437
|
-
<span class="ruby-identifier">probs</span> =
|
438
|
-
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cn</span><span class="ruby-operator">|</span>
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
<span class="ruby-identifier">row</span>.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">columns</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">colname</span>, <span class="ruby-identifier">cell</span><span class="ruby-operator">|</span>
|
443
|
-
<span class="ruby-identifier">classname</span> = <span class="ruby-identifier">colname</span>.<span class="ruby-identifier">split</span>(<span class="ruby-value str">':'</span>)[<span class="ruby-value">1</span>].<span class="ruby-identifier">intern</span>
|
444
|
-
<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">classname</span>] = <span class="ruby-identifier">cell</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">classes</span>[<span class="ruby-identifier">classname</span>].<span class="ruby-identifier">word_count</span>
|
302
|
+
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 65</span>
|
303
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>)
|
304
|
+
<span class="ruby-identifier">probs</span> = <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_word_counts</span>(<span class="ruby-identifier">word</span>)
|
305
|
+
<span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cn</span><span class="ruby-operator">|</span>
|
306
|
+
<span class="ruby-comment cmt"># use a laplacian smoother</span>
|
307
|
+
<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] = (<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> (<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_total_word_count</span>(<span class="ruby-identifier">cn</span>) <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span>
|
445
308
|
}
|
446
309
|
<span class="ruby-identifier">probs</span>
|
447
310
|
<span class="ruby-keyword kw">end</span>
|
@@ -450,58 +313,6 @@ get all classes
|
|
450
313
|
</div>
|
451
314
|
</div>
|
452
315
|
|
453
|
-
<div id="method-M000011" class="method-detail">
|
454
|
-
<a name="M000011"></a>
|
455
|
-
|
456
|
-
<div class="method-heading">
|
457
|
-
<a href="#M000011" class="method-signature">
|
458
|
-
<span class="method-name">init_tables</span><span class="method-args">()</span>
|
459
|
-
</a>
|
460
|
-
</div>
|
461
|
-
|
462
|
-
<div class="method-description">
|
463
|
-
<p><a class="source-toggle" href="#"
|
464
|
-
onclick="toggleCode('M000011-source');return false;">[Source]</a></p>
|
465
|
-
<div class="method-source-code" id="M000011-source">
|
466
|
-
<pre>
|
467
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 106</span>
|
468
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">init_tables</span>
|
469
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">has_table?</span> <span class="ruby-ivar">@ftablename</span>
|
470
|
-
<span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">create_table</span> <span class="ruby-ivar">@ftablename</span>, <span class="ruby-value str">"classes"</span>, <span class="ruby-value str">"total"</span>
|
471
|
-
<span class="ruby-keyword kw">end</span>
|
472
|
-
|
473
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">has_table?</span> <span class="ruby-ivar">@stablename</span>
|
474
|
-
<span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">create_table</span> <span class="ruby-ivar">@stablename</span>, <span class="ruby-value str">"totals"</span>
|
475
|
-
<span class="ruby-keyword kw">end</span>
|
476
|
-
<span class="ruby-keyword kw">end</span>
|
477
|
-
</pre>
|
478
|
-
</div>
|
479
|
-
</div>
|
480
|
-
</div>
|
481
|
-
|
482
|
-
<div id="method-M000012" class="method-detail">
|
483
|
-
<a name="M000012"></a>
|
484
|
-
|
485
|
-
<div class="method-heading">
|
486
|
-
<a href="#M000012" class="method-signature">
|
487
|
-
<span class="method-name">summary_table</span><span class="method-args">()</span>
|
488
|
-
</a>
|
489
|
-
</div>
|
490
|
-
|
491
|
-
<div class="method-description">
|
492
|
-
<p><a class="source-toggle" href="#"
|
493
|
-
onclick="toggleCode('M000012-source');return false;">[Source]</a></p>
|
494
|
-
<div class="method-source-code" id="M000012-source">
|
495
|
-
<pre>
|
496
|
-
<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 116</span>
|
497
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">summary_table</span>
|
498
|
-
<span class="ruby-ivar">@stable</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">get_table</span> <span class="ruby-ivar">@stablename</span>
|
499
|
-
<span class="ruby-keyword kw">end</span>
|
500
|
-
</pre>
|
501
|
-
</div>
|
502
|
-
</div>
|
503
|
-
</div>
|
504
|
-
|
505
316
|
|
506
317
|
</div>
|
507
318
|
|