ankusa 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +80 -6
- data/Rakefile +22 -10
- data/docs/classes/Ankusa.html +29 -1
- data/docs/classes/Ankusa/CassandraStorage.html +615 -0
- data/docs/classes/Ankusa/Classifier.html +23 -131
- data/docs/classes/Ankusa/HBaseStorage.html +102 -102
- data/docs/classes/Ankusa/KLDivergenceClassifier.html +194 -0
- data/docs/classes/Ankusa/MemoryStorage.html +84 -84
- data/docs/classes/Ankusa/NaiveBayesClassifier.html +231 -0
- data/docs/classes/Ankusa/TextHash.html +30 -30
- data/docs/created.rid +1 -1
- data/docs/files/README_rdoc.html +132 -11
- data/docs/files/lib/ankusa/cassandra_storage_rb.html +108 -0
- data/docs/files/lib/ankusa/classifier_rb.html +1 -1
- data/docs/files/lib/ankusa/kl_divergence_rb.html +101 -0
- data/docs/files/lib/ankusa/naive_bayes_rb.html +101 -0
- data/docs/files/lib/ankusa_rb.html +3 -3
- data/docs/fr_class_index.html +3 -0
- data/docs/fr_file_index.html +3 -0
- data/docs/fr_method_index.html +59 -42
- data/lib/ankusa.rb +2 -2
- data/lib/ankusa/cassandra_storage.rb +194 -0
- data/lib/ankusa/classifier.rb +1 -39
- data/lib/ankusa/kl_divergence.rb +31 -0
- data/lib/ankusa/naive_bayes.rb +46 -0
- metadata +19 -26
@@ -0,0 +1,231 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
+
<head>
|
8
|
+
<title>Class: Ankusa::NaiveBayesClassifier</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
+
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
12
|
+
<script type="text/javascript">
|
13
|
+
// <![CDATA[
|
14
|
+
|
15
|
+
function popupCode( url ) {
|
16
|
+
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
+
}
|
18
|
+
|
19
|
+
function toggleCode( id ) {
|
20
|
+
if ( document.getElementById )
|
21
|
+
elem = document.getElementById( id );
|
22
|
+
else if ( document.all )
|
23
|
+
elem = eval( "document.all." + id );
|
24
|
+
else
|
25
|
+
return false;
|
26
|
+
|
27
|
+
elemStyle = elem.style;
|
28
|
+
|
29
|
+
if ( elemStyle.display != "block" ) {
|
30
|
+
elemStyle.display = "block"
|
31
|
+
} else {
|
32
|
+
elemStyle.display = "none"
|
33
|
+
}
|
34
|
+
|
35
|
+
return true;
|
36
|
+
}
|
37
|
+
|
38
|
+
// Make codeblocks hidden by default
|
39
|
+
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
+
|
41
|
+
// ]]>
|
42
|
+
</script>
|
43
|
+
|
44
|
+
</head>
|
45
|
+
<body>
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
<div id="classHeader">
|
50
|
+
<table class="header-table">
|
51
|
+
<tr class="top-aligned-row">
|
52
|
+
<td><strong>Class</strong></td>
|
53
|
+
<td class="class-name-in-header">Ankusa::NaiveBayesClassifier</td>
|
54
|
+
</tr>
|
55
|
+
<tr class="top-aligned-row">
|
56
|
+
<td><strong>In:</strong></td>
|
57
|
+
<td>
|
58
|
+
<a href="../../files/lib/ankusa/naive_bayes_rb.html">
|
59
|
+
lib/ankusa/naive_bayes.rb
|
60
|
+
</a>
|
61
|
+
<br />
|
62
|
+
</td>
|
63
|
+
</tr>
|
64
|
+
|
65
|
+
<tr class="top-aligned-row">
|
66
|
+
<td><strong>Parent:</strong></td>
|
67
|
+
<td>
|
68
|
+
Object
|
69
|
+
</td>
|
70
|
+
</tr>
|
71
|
+
</table>
|
72
|
+
</div>
|
73
|
+
<!-- banner header -->
|
74
|
+
|
75
|
+
<div id="bodyContent">
|
76
|
+
|
77
|
+
|
78
|
+
|
79
|
+
<div id="contextContent">
|
80
|
+
|
81
|
+
|
82
|
+
|
83
|
+
</div>
|
84
|
+
|
85
|
+
<div id="method-list">
|
86
|
+
<h3 class="section-bar">Methods</h3>
|
87
|
+
|
88
|
+
<div class="name-list">
|
89
|
+
<a href="#M000025">classifications</a>
|
90
|
+
<a href="#M000024">classify</a>
|
91
|
+
<a href="#M000026">log_likelihoods</a>
|
92
|
+
</div>
|
93
|
+
</div>
|
94
|
+
|
95
|
+
</div>
|
96
|
+
|
97
|
+
|
98
|
+
<!-- if includes -->
|
99
|
+
<div id="includes">
|
100
|
+
<h3 class="section-bar">Included Modules</h3>
|
101
|
+
|
102
|
+
<div id="includes-list">
|
103
|
+
<span class="include-name"><a href="Classifier.html">Classifier</a></span>
|
104
|
+
</div>
|
105
|
+
</div>
|
106
|
+
|
107
|
+
<div id="section">
|
108
|
+
|
109
|
+
|
110
|
+
|
111
|
+
|
112
|
+
|
113
|
+
|
114
|
+
|
115
|
+
|
116
|
+
<!-- if method_list -->
|
117
|
+
<div id="methods">
|
118
|
+
<h3 class="section-bar">Public Instance methods</h3>
|
119
|
+
|
120
|
+
<div id="method-M000025" class="method-detail">
|
121
|
+
<a name="M000025"></a>
|
122
|
+
|
123
|
+
<div class="method-heading">
|
124
|
+
<a href="#M000025" class="method-signature">
|
125
|
+
<span class="method-name">classifications</span><span class="method-args">(text, classnames=nil)</span>
|
126
|
+
</a>
|
127
|
+
</div>
|
128
|
+
|
129
|
+
<div class="method-description">
|
130
|
+
<p>
|
131
|
+
Classes is an array of classes to look at
|
132
|
+
</p>
|
133
|
+
<p><a class="source-toggle" href="#"
|
134
|
+
onclick="toggleCode('M000025-source');return false;">[Source]</a></p>
|
135
|
+
<div class="method-source-code" id="M000025-source">
|
136
|
+
<pre>
|
137
|
+
<span class="ruby-comment cmt"># File lib/ankusa/naive_bayes.rb, line 12</span>
|
138
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword kw">nil</span>)
|
139
|
+
<span class="ruby-identifier">result</span> = <span class="ruby-identifier">log_likelihoods</span> <span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>
|
140
|
+
<span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
|
141
|
+
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>]
|
142
|
+
}
|
143
|
+
|
144
|
+
<span class="ruby-comment cmt"># normalize to get probs</span>
|
145
|
+
<span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
|
146
|
+
<span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span> }
|
147
|
+
<span class="ruby-identifier">result</span>
|
148
|
+
<span class="ruby-keyword kw">end</span>
|
149
|
+
</pre>
|
150
|
+
</div>
|
151
|
+
</div>
|
152
|
+
</div>
|
153
|
+
|
154
|
+
<div id="method-M000024" class="method-detail">
|
155
|
+
<a name="M000024"></a>
|
156
|
+
|
157
|
+
<div class="method-heading">
|
158
|
+
<a href="#M000024" class="method-signature">
|
159
|
+
<span class="method-name">classify</span><span class="method-args">(text, classes=nil)</span>
|
160
|
+
</a>
|
161
|
+
</div>
|
162
|
+
|
163
|
+
<div class="method-description">
|
164
|
+
<p><a class="source-toggle" href="#"
|
165
|
+
onclick="toggleCode('M000024-source');return false;">[Source]</a></p>
|
166
|
+
<div class="method-source-code" id="M000024-source">
|
167
|
+
<pre>
|
168
|
+
<span class="ruby-comment cmt"># File lib/ankusa/naive_bayes.rb, line 6</span>
|
169
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>=<span class="ruby-keyword kw">nil</span>)
|
170
|
+
<span class="ruby-comment cmt"># return the most probable class</span>
|
171
|
+
<span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>).<span class="ruby-identifier">sort_by</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">c</span><span class="ruby-operator">|</span> <span class="ruby-operator">-</span><span class="ruby-identifier">c</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
|
172
|
+
<span class="ruby-keyword kw">end</span>
|
173
|
+
</pre>
|
174
|
+
</div>
|
175
|
+
</div>
|
176
|
+
</div>
|
177
|
+
|
178
|
+
<div id="method-M000026" class="method-detail">
|
179
|
+
<a name="M000026"></a>
|
180
|
+
|
181
|
+
<div class="method-heading">
|
182
|
+
<a href="#M000026" class="method-signature">
|
183
|
+
<span class="method-name">log_likelihoods</span><span class="method-args">(text, classnames=nil)</span>
|
184
|
+
</a>
|
185
|
+
</div>
|
186
|
+
|
187
|
+
<div class="method-description">
|
188
|
+
<p>
|
189
|
+
Classes is an array of classes to look at
|
190
|
+
</p>
|
191
|
+
<p><a class="source-toggle" href="#"
|
192
|
+
onclick="toggleCode('M000026-source');return false;">[Source]</a></p>
|
193
|
+
<div class="method-source-code" id="M000026-source">
|
194
|
+
<pre>
|
195
|
+
<span class="ruby-comment cmt"># File lib/ankusa/naive_bayes.rb, line 25</span>
|
196
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword kw">nil</span>)
|
197
|
+
<span class="ruby-identifier">classnames</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@classnames</span>
|
198
|
+
<span class="ruby-identifier">result</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
|
199
|
+
|
200
|
+
<span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
|
201
|
+
<span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classnames</span>)
|
202
|
+
<span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) <span class="ruby-operator">*</span> <span class="ruby-identifier">count</span>) }
|
203
|
+
}
|
204
|
+
|
205
|
+
<span class="ruby-comment cmt"># add the log prior for each class</span>
|
206
|
+
<span class="ruby-identifier">doc_counts</span> = <span class="ruby-identifier">doc_count_totals</span>.<span class="ruby-identifier">select</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">k</span> }.<span class="ruby-identifier">map</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">v</span> }
|
207
|
+
<span class="ruby-identifier">doc_count_total</span> = (<span class="ruby-identifier">doc_counts</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> } <span class="ruby-operator">+</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">length</span>).<span class="ruby-identifier">to_f</span>
|
208
|
+
<span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
|
209
|
+
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>((<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>) <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">doc_count_total</span>)
|
210
|
+
}
|
211
|
+
|
212
|
+
<span class="ruby-identifier">result</span>
|
213
|
+
<span class="ruby-keyword kw">end</span>
|
214
|
+
</pre>
|
215
|
+
</div>
|
216
|
+
</div>
|
217
|
+
</div>
|
218
|
+
|
219
|
+
|
220
|
+
</div>
|
221
|
+
|
222
|
+
|
223
|
+
</div>
|
224
|
+
|
225
|
+
|
226
|
+
<div id="validator-badges">
|
227
|
+
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
228
|
+
</div>
|
229
|
+
|
230
|
+
</body>
|
231
|
+
</html>
|
@@ -86,11 +86,11 @@
|
|
86
86
|
<h3 class="section-bar">Methods</h3>
|
87
87
|
|
88
88
|
<div class="name-list">
|
89
|
-
<a href="#
|
90
|
-
<a href="#
|
91
|
-
<a href="#
|
92
|
-
<a href="#
|
93
|
-
<a href="#
|
89
|
+
<a href="#M000061">add_text</a>
|
90
|
+
<a href="#M000062">add_word</a>
|
91
|
+
<a href="#M000063">atomize</a>
|
92
|
+
<a href="#M000060">new</a>
|
93
|
+
<a href="#M000064">valid_word?</a>
|
94
94
|
</div>
|
95
95
|
</div>
|
96
96
|
|
@@ -125,19 +125,19 @@
|
|
125
125
|
<div id="methods">
|
126
126
|
<h3 class="section-bar">Public Class methods</h3>
|
127
127
|
|
128
|
-
<div id="method-
|
129
|
-
<a name="
|
128
|
+
<div id="method-M000063" class="method-detail">
|
129
|
+
<a name="M000063"></a>
|
130
130
|
|
131
131
|
<div class="method-heading">
|
132
|
-
<a href="#
|
132
|
+
<a href="#M000063" class="method-signature">
|
133
133
|
<span class="method-name">atomize</span><span class="method-args">(text)</span>
|
134
134
|
</a>
|
135
135
|
</div>
|
136
136
|
|
137
137
|
<div class="method-description">
|
138
138
|
<p><a class="source-toggle" href="#"
|
139
|
-
onclick="toggleCode('
|
140
|
-
<div class="method-source-code" id="
|
139
|
+
onclick="toggleCode('M000063-source');return false;">[Source]</a></p>
|
140
|
+
<div class="method-source-code" id="M000063-source">
|
141
141
|
<pre>
|
142
142
|
<span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 33</span>
|
143
143
|
<span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">atomize</span>(<span class="ruby-identifier">text</span>)
|
@@ -148,19 +148,19 @@
|
|
148
148
|
</div>
|
149
149
|
</div>
|
150
150
|
|
151
|
-
<div id="method-
|
152
|
-
<a name="
|
151
|
+
<div id="method-M000060" class="method-detail">
|
152
|
+
<a name="M000060"></a>
|
153
153
|
|
154
154
|
<div class="method-heading">
|
155
|
-
<a href="#
|
155
|
+
<a href="#M000060" class="method-signature">
|
156
156
|
<span class="method-name">new</span><span class="method-args">(text=nil)</span>
|
157
157
|
</a>
|
158
158
|
</div>
|
159
159
|
|
160
160
|
<div class="method-description">
|
161
161
|
<p><a class="source-toggle" href="#"
|
162
|
-
onclick="toggleCode('
|
163
|
-
<div class="method-source-code" id="
|
162
|
+
onclick="toggleCode('M000060-source');return false;">[Source]</a></p>
|
163
|
+
<div class="method-source-code" id="M000060-source">
|
164
164
|
<pre>
|
165
165
|
<span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 9</span>
|
166
166
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">text</span>=<span class="ruby-keyword kw">nil</span>)
|
@@ -173,11 +173,11 @@
|
|
173
173
|
</div>
|
174
174
|
</div>
|
175
175
|
|
176
|
-
<div id="method-
|
177
|
-
<a name="
|
176
|
+
<div id="method-M000064" class="method-detail">
|
177
|
+
<a name="M000064"></a>
|
178
178
|
|
179
179
|
<div class="method-heading">
|
180
|
-
<a href="#
|
180
|
+
<a href="#M000064" class="method-signature">
|
181
181
|
<span class="method-name">valid_word?</span><span class="method-args">(word)</span>
|
182
182
|
</a>
|
183
183
|
</div>
|
@@ -187,8 +187,8 @@
|
|
187
187
|
word should be only alphanum chars at this point
|
188
188
|
</p>
|
189
189
|
<p><a class="source-toggle" href="#"
|
190
|
-
onclick="toggleCode('
|
191
|
-
<div class="method-source-code" id="
|
190
|
+
onclick="toggleCode('M000064-source');return false;">[Source]</a></p>
|
191
|
+
<div class="method-source-code" id="M000064-source">
|
192
192
|
<pre>
|
193
193
|
<span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 38</span>
|
194
194
|
<span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">valid_word?</span>(<span class="ruby-identifier">word</span>)
|
@@ -204,19 +204,19 @@ word should be only alphanum chars at this point
|
|
204
204
|
|
205
205
|
<h3 class="section-bar">Public Instance methods</h3>
|
206
206
|
|
207
|
-
<div id="method-
|
208
|
-
<a name="
|
207
|
+
<div id="method-M000061" class="method-detail">
|
208
|
+
<a name="M000061"></a>
|
209
209
|
|
210
210
|
<div class="method-heading">
|
211
|
-
<a href="#
|
211
|
+
<a href="#M000061" class="method-signature">
|
212
212
|
<span class="method-name">add_text</span><span class="method-args">(text)</span>
|
213
213
|
</a>
|
214
214
|
</div>
|
215
215
|
|
216
216
|
<div class="method-description">
|
217
217
|
<p><a class="source-toggle" href="#"
|
218
|
-
onclick="toggleCode('
|
219
|
-
<div class="method-source-code" id="
|
218
|
+
onclick="toggleCode('M000061-source');return false;">[Source]</a></p>
|
219
|
+
<div class="method-source-code" id="M000061-source">
|
220
220
|
<pre>
|
221
221
|
<span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 15</span>
|
222
222
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_text</span>(<span class="ruby-identifier">text</span>)
|
@@ -235,19 +235,19 @@ word should be only alphanum chars at this point
|
|
235
235
|
</div>
|
236
236
|
</div>
|
237
237
|
|
238
|
-
<div id="method-
|
239
|
-
<a name="
|
238
|
+
<div id="method-M000062" class="method-detail">
|
239
|
+
<a name="M000062"></a>
|
240
240
|
|
241
241
|
<div class="method-heading">
|
242
|
-
<a href="#
|
242
|
+
<a href="#M000062" class="method-signature">
|
243
243
|
<span class="method-name">add_word</span><span class="method-args">(word)</span>
|
244
244
|
</a>
|
245
245
|
</div>
|
246
246
|
|
247
247
|
<div class="method-description">
|
248
248
|
<p><a class="source-toggle" href="#"
|
249
|
-
onclick="toggleCode('
|
250
|
-
<div class="method-source-code" id="
|
249
|
+
onclick="toggleCode('M000062-source');return false;">[Source]</a></p>
|
250
|
+
<div class="method-source-code" id="M000062-source">
|
251
251
|
<pre>
|
252
252
|
<span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 27</span>
|
253
253
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_word</span>(<span class="ruby-identifier">word</span>)
|
data/docs/created.rid
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
Sun, 12 Dec 2010 13:34:32 -0500
|
data/docs/files/README_rdoc.html
CHANGED
@@ -56,7 +56,7 @@
|
|
56
56
|
</tr>
|
57
57
|
<tr class="top-aligned-row">
|
58
58
|
<td><strong>Last Update:</strong></td>
|
59
|
-
<td>
|
59
|
+
<td>Sun Dec 12 13:30:40 -0500 2010</td>
|
60
60
|
</tr>
|
61
61
|
</table>
|
62
62
|
</div>
|
@@ -72,31 +72,45 @@
|
|
72
72
|
<h1>ankusa</h1>
|
73
73
|
<p>
|
74
74
|
<a href="../classes/Ankusa.html">Ankusa</a> is a text classifier in Ruby
|
75
|
-
that
|
76
|
-
backend, the training corpus can be many
|
75
|
+
that can use either Hadoop's HBase or Cassandra for storage. Because
|
76
|
+
it uses HBase or Cassandra as a backend, the training corpus can be many
|
77
|
+
terabytes in size.
|
77
78
|
</p>
|
78
79
|
<p>
|
79
|
-
<a href="../classes/Ankusa.html">Ankusa</a> currently
|
80
|
-
classifier. It ignores common words
|
81
|
-
others. Additionally, it uses Laplacian
|
82
|
-
|
80
|
+
<a href="../classes/Ankusa.html">Ankusa</a> currently provides both a Naive
|
81
|
+
Bayes and Kullback-Leibler divergence classifier. It ignores common words
|
82
|
+
(a.k.a. stop words) and stems all others. Additionally, it uses Laplacian
|
83
|
+
smoothing in both classification methods.
|
83
84
|
</p>
|
84
85
|
<h2>Installation</h2>
|
85
86
|
<p>
|
86
|
-
First, install HBase
|
87
|
-
|
87
|
+
First, install HBase/Hadoop or Cassandra (>= 0.7.0-rc2). Then, install
|
88
|
+
the appropriate gem:
|
89
|
+
</p>
|
90
|
+
<pre>
|
91
|
+
gem install hbaserb
|
92
|
+
# or
|
93
|
+
gem install cassandra
|
94
|
+
</pre>
|
95
|
+
<p>
|
96
|
+
If you're using HBase, make sure the HBase Thrift interface has been
|
97
|
+
started as well. Then:
|
88
98
|
</p>
|
89
99
|
<pre>
|
90
100
|
gem install ankusa
|
91
101
|
</pre>
|
92
102
|
<h2>Basic Usage</h2>
|
103
|
+
<p>
|
104
|
+
Using the naive Bayes classifier:
|
105
|
+
</p>
|
93
106
|
<pre>
|
94
107
|
require 'rubygems'
|
95
108
|
require 'ankusa'
|
109
|
+
require 'ankusa/hbase_storage'
|
96
110
|
|
97
111
|
# connect to HBase
|
98
112
|
storage = Ankusa::HBaseStorage.new 'localhost'
|
99
|
-
c = Ankusa::
|
113
|
+
c = Ankusa::NaiveBayesClassifier.new storage
|
100
114
|
|
101
115
|
# Each of these calls will return a bag-of-words
|
102
116
|
# hash with stemmed words as keys and counts as values
|
@@ -116,11 +130,118 @@ been started as well. Then:
|
|
116
130
|
puts c.log_likelihoods "This is some spammy text"
|
117
131
|
|
118
132
|
# get a list of all classes
|
119
|
-
puts c.
|
133
|
+
puts c.classnames
|
120
134
|
|
121
135
|
# close connection
|
122
136
|
storage.close
|
123
137
|
</pre>
|
138
|
+
<h2>KL Divergence Classifier</h2>
|
139
|
+
<p>
|
140
|
+
There is a Kullback–Leibler divergence classifier as well. KL divergence
|
141
|
+
is a distance measure (though not a true metric because it does not satisfy
|
142
|
+
the triangle inequality). The KL classifier simply measures the relative
|
143
|
+
entropy between the text you want to classify and each of the classes. The
|
144
|
+
class with the shortest "distance" is the best class. You may
|
145
|
+
find that for an especially large corpus it may be slightly faster to use
|
146
|
+
this classifier (since prior probabilities are never calculated, only
|
147
|
+
likelihoods).
|
148
|
+
</p>
|
149
|
+
<p>
|
150
|
+
The API is the same as the NaiveBayesClassifier, except rather than calling
|
151
|
+
"classifications" if you want actual numbers you call
|
152
|
+
"distances".
|
153
|
+
</p>
|
154
|
+
<pre>
|
155
|
+
require 'rubygems'
|
156
|
+
require 'ankusa'
|
157
|
+
require 'ankusa/hbase_storage'
|
158
|
+
|
159
|
+
# connect to HBase
|
160
|
+
storage = Ankusa::HBaseStorage.new 'localhost'
|
161
|
+
c = Ankusa::KLDivergenceClassifier.new storage
|
162
|
+
|
163
|
+
# Each of these calls will return a bag-of-words
|
164
|
+
# hash with stemmed words as keys and counts as values
|
165
|
+
c.train :spam, "This is some spammy text"
|
166
|
+
c.train :good, "This is not the bad stuff"
|
167
|
+
|
168
|
+
# This will return the most likely class (as symbol)
|
169
|
+
puts c.classify "This is some spammy text"
|
170
|
+
|
171
|
+
# This will return Hash with classes as keys and
|
172
|
+
# distances >= 0 as values
|
173
|
+
puts c.distances "This is some spammy text"
|
174
|
+
|
175
|
+
# get a list of all classes
|
176
|
+
puts c.classnames
|
177
|
+
|
178
|
+
# close connection
|
179
|
+
storage.close
|
180
|
+
</pre>
|
181
|
+
<h2>Storage Methods</h2>
|
182
|
+
<p>
|
183
|
+
<a href="../classes/Ankusa.html">Ankusa</a> has a generalized storage
|
184
|
+
interface that has been implemented for HBase, Cassandra, and in-memory
|
185
|
+
storage.
|
186
|
+
</p>
|
187
|
+
<p>
|
188
|
+
Memory storage can be used when you have a very small corpus:
|
189
|
+
</p>
|
190
|
+
<pre>
|
191
|
+
require 'ankusa/memory_storage'
|
192
|
+
storage = Ankusa::MemoryStorage.new
|
193
|
+
</pre>
|
194
|
+
<p>
|
195
|
+
HBase storage:
|
196
|
+
</p>
|
197
|
+
<pre>
|
198
|
+
require 'ankusa/hbase_storage'
|
199
|
+
# defaults: host='localhost', port=9090, frequency_tablename="ankusa_word_frequencies", summary_tablename="ankusa_summary"
|
200
|
+
storage = Ankusa::HBaseStorage.new host, port, frequency_tablename, summary_tablename
|
201
|
+
</pre>
|
202
|
+
<p>
|
203
|
+
For Cassandra storage:
|
204
|
+
</p>
|
205
|
+
<ul>
|
206
|
+
<li>You will need Cassandra version 0.7.0-rc2 or greater.
|
207
|
+
|
208
|
+
</li>
|
209
|
+
<li>You will need to set a max number of classes since the current implementation of
|
210
|
+
the Ruby Cassandra client doesn't support table scans.
|
211
|
+
|
212
|
+
</li>
|
213
|
+
<li>Prior to using the Cassandra storage you will need to run the following
|
214
|
+
command from the cassandra-cli: "create keyspace ankusa with
|
215
|
+
replication_factor = 1". This should be fixed with a new release
|
216
|
+
candidate for Cassandra.
|
217
|
+
|
218
|
+
</li>
|
219
|
+
</ul>
|
220
|
+
<p>
|
221
|
+
To use the Cassandra storage class:
|
222
|
+
</p>
|
223
|
+
<pre>
|
224
|
+
require 'ankusa/cassandra_storage'
|
225
|
+
# defaults: host='127.0.0.1', port=9160, keyspace = 'ankusa', max_classes = 100
|
226
|
+
storage = Ankusa::CassandraStorage.new host, port, keyspace, max_classes
|
227
|
+
</pre>
|
228
|
+
<h2>Running Tests</h2>
|
229
|
+
<p>
|
230
|
+
You can run the tests for any of the three storage methods. For instance,
|
231
|
+
for memory storage:
|
232
|
+
</p>
|
233
|
+
<pre>
|
234
|
+
rake test_memory
|
235
|
+
</pre>
|
236
|
+
<p>
|
237
|
+
For the other methods you will need to edit the file test/config.yml and
|
238
|
+
set the configuration params. Then:
|
239
|
+
</p>
|
240
|
+
<pre>
|
241
|
+
rake test_hbase
|
242
|
+
# or
|
243
|
+
rake test_cassandra
|
244
|
+
</pre>
|
124
245
|
|
125
246
|
</div>
|
126
247
|
|