ankusa 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +80 -6
- data/Rakefile +22 -10
- data/docs/classes/Ankusa.html +29 -1
- data/docs/classes/Ankusa/CassandraStorage.html +615 -0
- data/docs/classes/Ankusa/Classifier.html +23 -131
- data/docs/classes/Ankusa/HBaseStorage.html +102 -102
- data/docs/classes/Ankusa/KLDivergenceClassifier.html +194 -0
- data/docs/classes/Ankusa/MemoryStorage.html +84 -84
- data/docs/classes/Ankusa/NaiveBayesClassifier.html +231 -0
- data/docs/classes/Ankusa/TextHash.html +30 -30
- data/docs/created.rid +1 -1
- data/docs/files/README_rdoc.html +132 -11
- data/docs/files/lib/ankusa/cassandra_storage_rb.html +108 -0
- data/docs/files/lib/ankusa/classifier_rb.html +1 -1
- data/docs/files/lib/ankusa/kl_divergence_rb.html +101 -0
- data/docs/files/lib/ankusa/naive_bayes_rb.html +101 -0
- data/docs/files/lib/ankusa_rb.html +3 -3
- data/docs/fr_class_index.html +3 -0
- data/docs/fr_file_index.html +3 -0
- data/docs/fr_method_index.html +59 -42
- data/lib/ankusa.rb +2 -2
- data/lib/ankusa/cassandra_storage.rb +194 -0
- data/lib/ankusa/classifier.rb +1 -39
- data/lib/ankusa/kl_divergence.rb +31 -0
- data/lib/ankusa/naive_bayes.rb +46 -0
- metadata +19 -26
@@ -0,0 +1,231 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
+
<head>
|
8
|
+
<title>Class: Ankusa::NaiveBayesClassifier</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
+
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
12
|
+
<script type="text/javascript">
|
13
|
+
// <![CDATA[
|
14
|
+
|
15
|
+
function popupCode( url ) {
|
16
|
+
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
+
}
|
18
|
+
|
19
|
+
function toggleCode( id ) {
|
20
|
+
if ( document.getElementById )
|
21
|
+
elem = document.getElementById( id );
|
22
|
+
else if ( document.all )
|
23
|
+
elem = eval( "document.all." + id );
|
24
|
+
else
|
25
|
+
return false;
|
26
|
+
|
27
|
+
elemStyle = elem.style;
|
28
|
+
|
29
|
+
if ( elemStyle.display != "block" ) {
|
30
|
+
elemStyle.display = "block"
|
31
|
+
} else {
|
32
|
+
elemStyle.display = "none"
|
33
|
+
}
|
34
|
+
|
35
|
+
return true;
|
36
|
+
}
|
37
|
+
|
38
|
+
// Make codeblocks hidden by default
|
39
|
+
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
+
|
41
|
+
// ]]>
|
42
|
+
</script>
|
43
|
+
|
44
|
+
</head>
|
45
|
+
<body>
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
<div id="classHeader">
|
50
|
+
<table class="header-table">
|
51
|
+
<tr class="top-aligned-row">
|
52
|
+
<td><strong>Class</strong></td>
|
53
|
+
<td class="class-name-in-header">Ankusa::NaiveBayesClassifier</td>
|
54
|
+
</tr>
|
55
|
+
<tr class="top-aligned-row">
|
56
|
+
<td><strong>In:</strong></td>
|
57
|
+
<td>
|
58
|
+
<a href="../../files/lib/ankusa/naive_bayes_rb.html">
|
59
|
+
lib/ankusa/naive_bayes.rb
|
60
|
+
</a>
|
61
|
+
<br />
|
62
|
+
</td>
|
63
|
+
</tr>
|
64
|
+
|
65
|
+
<tr class="top-aligned-row">
|
66
|
+
<td><strong>Parent:</strong></td>
|
67
|
+
<td>
|
68
|
+
Object
|
69
|
+
</td>
|
70
|
+
</tr>
|
71
|
+
</table>
|
72
|
+
</div>
|
73
|
+
<!-- banner header -->
|
74
|
+
|
75
|
+
<div id="bodyContent">
|
76
|
+
|
77
|
+
|
78
|
+
|
79
|
+
<div id="contextContent">
|
80
|
+
|
81
|
+
|
82
|
+
|
83
|
+
</div>
|
84
|
+
|
85
|
+
<div id="method-list">
|
86
|
+
<h3 class="section-bar">Methods</h3>
|
87
|
+
|
88
|
+
<div class="name-list">
|
89
|
+
<a href="#M000025">classifications</a>
|
90
|
+
<a href="#M000024">classify</a>
|
91
|
+
<a href="#M000026">log_likelihoods</a>
|
92
|
+
</div>
|
93
|
+
</div>
|
94
|
+
|
95
|
+
</div>
|
96
|
+
|
97
|
+
|
98
|
+
<!-- if includes -->
|
99
|
+
<div id="includes">
|
100
|
+
<h3 class="section-bar">Included Modules</h3>
|
101
|
+
|
102
|
+
<div id="includes-list">
|
103
|
+
<span class="include-name"><a href="Classifier.html">Classifier</a></span>
|
104
|
+
</div>
|
105
|
+
</div>
|
106
|
+
|
107
|
+
<div id="section">
|
108
|
+
|
109
|
+
|
110
|
+
|
111
|
+
|
112
|
+
|
113
|
+
|
114
|
+
|
115
|
+
|
116
|
+
<!-- if method_list -->
|
117
|
+
<div id="methods">
|
118
|
+
<h3 class="section-bar">Public Instance methods</h3>
|
119
|
+
|
120
|
+
<div id="method-M000025" class="method-detail">
|
121
|
+
<a name="M000025"></a>
|
122
|
+
|
123
|
+
<div class="method-heading">
|
124
|
+
<a href="#M000025" class="method-signature">
|
125
|
+
<span class="method-name">classifications</span><span class="method-args">(text, classnames=nil)</span>
|
126
|
+
</a>
|
127
|
+
</div>
|
128
|
+
|
129
|
+
<div class="method-description">
|
130
|
+
<p>
|
131
|
+
classnames is an array of classes to look at
|
132
|
+
</p>
|
133
|
+
<p><a class="source-toggle" href="#"
|
134
|
+
onclick="toggleCode('M000025-source');return false;">[Source]</a></p>
|
135
|
+
<div class="method-source-code" id="M000025-source">
|
136
|
+
<pre>
|
137
|
+
<span class="ruby-comment cmt"># File lib/ankusa/naive_bayes.rb, line 12</span>
|
138
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword kw">nil</span>)
|
139
|
+
<span class="ruby-identifier">result</span> = <span class="ruby-identifier">log_likelihoods</span> <span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>
|
140
|
+
<span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
|
141
|
+
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>]
|
142
|
+
}
|
143
|
+
|
144
|
+
<span class="ruby-comment cmt"># normalize to get probs</span>
|
145
|
+
<span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
|
146
|
+
<span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span> }
|
147
|
+
<span class="ruby-identifier">result</span>
|
148
|
+
<span class="ruby-keyword kw">end</span>
|
149
|
+
</pre>
|
150
|
+
</div>
|
151
|
+
</div>
|
152
|
+
</div>
|
153
|
+
|
154
|
+
<div id="method-M000024" class="method-detail">
|
155
|
+
<a name="M000024"></a>
|
156
|
+
|
157
|
+
<div class="method-heading">
|
158
|
+
<a href="#M000024" class="method-signature">
|
159
|
+
<span class="method-name">classify</span><span class="method-args">(text, classes=nil)</span>
|
160
|
+
</a>
|
161
|
+
</div>
|
162
|
+
|
163
|
+
<div class="method-description">
|
164
|
+
<p><a class="source-toggle" href="#"
|
165
|
+
onclick="toggleCode('M000024-source');return false;">[Source]</a></p>
|
166
|
+
<div class="method-source-code" id="M000024-source">
|
167
|
+
<pre>
|
168
|
+
<span class="ruby-comment cmt"># File lib/ankusa/naive_bayes.rb, line 6</span>
|
169
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>=<span class="ruby-keyword kw">nil</span>)
|
170
|
+
<span class="ruby-comment cmt"># return the most probable class</span>
|
171
|
+
<span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>).<span class="ruby-identifier">sort_by</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">c</span><span class="ruby-operator">|</span> <span class="ruby-operator">-</span><span class="ruby-identifier">c</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
|
172
|
+
<span class="ruby-keyword kw">end</span>
|
173
|
+
</pre>
|
174
|
+
</div>
|
175
|
+
</div>
|
176
|
+
</div>
|
177
|
+
|
178
|
+
<div id="method-M000026" class="method-detail">
|
179
|
+
<a name="M000026"></a>
|
180
|
+
|
181
|
+
<div class="method-heading">
|
182
|
+
<a href="#M000026" class="method-signature">
|
183
|
+
<span class="method-name">log_likelihoods</span><span class="method-args">(text, classnames=nil)</span>
|
184
|
+
</a>
|
185
|
+
</div>
|
186
|
+
|
187
|
+
<div class="method-description">
|
188
|
+
<p>
|
189
|
+
classnames is an array of classes to look at
|
190
|
+
</p>
|
191
|
+
<p><a class="source-toggle" href="#"
|
192
|
+
onclick="toggleCode('M000026-source');return false;">[Source]</a></p>
|
193
|
+
<div class="method-source-code" id="M000026-source">
|
194
|
+
<pre>
|
195
|
+
<span class="ruby-comment cmt"># File lib/ankusa/naive_bayes.rb, line 25</span>
|
196
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword kw">nil</span>)
|
197
|
+
<span class="ruby-identifier">classnames</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@classnames</span>
|
198
|
+
<span class="ruby-identifier">result</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
|
199
|
+
|
200
|
+
<span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
|
201
|
+
<span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classnames</span>)
|
202
|
+
<span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) <span class="ruby-operator">*</span> <span class="ruby-identifier">count</span>) }
|
203
|
+
}
|
204
|
+
|
205
|
+
<span class="ruby-comment cmt"># add the prior and exponentiate</span>
|
206
|
+
<span class="ruby-identifier">doc_counts</span> = <span class="ruby-identifier">doc_count_totals</span>.<span class="ruby-identifier">select</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">k</span> }.<span class="ruby-identifier">map</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">v</span> }
|
207
|
+
<span class="ruby-identifier">doc_count_total</span> = (<span class="ruby-identifier">doc_counts</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> } <span class="ruby-operator">+</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">length</span>).<span class="ruby-identifier">to_f</span>
|
208
|
+
<span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
|
209
|
+
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>((<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>) <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">doc_count_total</span>)
|
210
|
+
}
|
211
|
+
|
212
|
+
<span class="ruby-identifier">result</span>
|
213
|
+
<span class="ruby-keyword kw">end</span>
|
214
|
+
</pre>
|
215
|
+
</div>
|
216
|
+
</div>
|
217
|
+
</div>
|
218
|
+
|
219
|
+
|
220
|
+
</div>
|
221
|
+
|
222
|
+
|
223
|
+
</div>
|
224
|
+
|
225
|
+
|
226
|
+
<div id="validator-badges">
|
227
|
+
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
228
|
+
</div>
|
229
|
+
|
230
|
+
</body>
|
231
|
+
</html>
|
@@ -86,11 +86,11 @@
|
|
86
86
|
<h3 class="section-bar">Methods</h3>
|
87
87
|
|
88
88
|
<div class="name-list">
|
89
|
-
<a href="#
|
90
|
-
<a href="#
|
91
|
-
<a href="#
|
92
|
-
<a href="#
|
93
|
-
<a href="#
|
89
|
+
<a href="#M000061">add_text</a>
|
90
|
+
<a href="#M000062">add_word</a>
|
91
|
+
<a href="#M000063">atomize</a>
|
92
|
+
<a href="#M000060">new</a>
|
93
|
+
<a href="#M000064">valid_word?</a>
|
94
94
|
</div>
|
95
95
|
</div>
|
96
96
|
|
@@ -125,19 +125,19 @@
|
|
125
125
|
<div id="methods">
|
126
126
|
<h3 class="section-bar">Public Class methods</h3>
|
127
127
|
|
128
|
-
<div id="method-
|
129
|
-
<a name="
|
128
|
+
<div id="method-M000063" class="method-detail">
|
129
|
+
<a name="M000063"></a>
|
130
130
|
|
131
131
|
<div class="method-heading">
|
132
|
-
<a href="#
|
132
|
+
<a href="#M000063" class="method-signature">
|
133
133
|
<span class="method-name">atomize</span><span class="method-args">(text)</span>
|
134
134
|
</a>
|
135
135
|
</div>
|
136
136
|
|
137
137
|
<div class="method-description">
|
138
138
|
<p><a class="source-toggle" href="#"
|
139
|
-
onclick="toggleCode('
|
140
|
-
<div class="method-source-code" id="
|
139
|
+
onclick="toggleCode('M000063-source');return false;">[Source]</a></p>
|
140
|
+
<div class="method-source-code" id="M000063-source">
|
141
141
|
<pre>
|
142
142
|
<span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 33</span>
|
143
143
|
<span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">atomize</span>(<span class="ruby-identifier">text</span>)
|
@@ -148,19 +148,19 @@
|
|
148
148
|
</div>
|
149
149
|
</div>
|
150
150
|
|
151
|
-
<div id="method-
|
152
|
-
<a name="
|
151
|
+
<div id="method-M000060" class="method-detail">
|
152
|
+
<a name="M000060"></a>
|
153
153
|
|
154
154
|
<div class="method-heading">
|
155
|
-
<a href="#
|
155
|
+
<a href="#M000060" class="method-signature">
|
156
156
|
<span class="method-name">new</span><span class="method-args">(text=nil)</span>
|
157
157
|
</a>
|
158
158
|
</div>
|
159
159
|
|
160
160
|
<div class="method-description">
|
161
161
|
<p><a class="source-toggle" href="#"
|
162
|
-
onclick="toggleCode('
|
163
|
-
<div class="method-source-code" id="
|
162
|
+
onclick="toggleCode('M000060-source');return false;">[Source]</a></p>
|
163
|
+
<div class="method-source-code" id="M000060-source">
|
164
164
|
<pre>
|
165
165
|
<span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 9</span>
|
166
166
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">text</span>=<span class="ruby-keyword kw">nil</span>)
|
@@ -173,11 +173,11 @@
|
|
173
173
|
</div>
|
174
174
|
</div>
|
175
175
|
|
176
|
-
<div id="method-
|
177
|
-
<a name="
|
176
|
+
<div id="method-M000064" class="method-detail">
|
177
|
+
<a name="M000064"></a>
|
178
178
|
|
179
179
|
<div class="method-heading">
|
180
|
-
<a href="#
|
180
|
+
<a href="#M000064" class="method-signature">
|
181
181
|
<span class="method-name">valid_word?</span><span class="method-args">(word)</span>
|
182
182
|
</a>
|
183
183
|
</div>
|
@@ -187,8 +187,8 @@
|
|
187
187
|
word should be only alphanum chars at this point
|
188
188
|
</p>
|
189
189
|
<p><a class="source-toggle" href="#"
|
190
|
-
onclick="toggleCode('
|
191
|
-
<div class="method-source-code" id="
|
190
|
+
onclick="toggleCode('M000064-source');return false;">[Source]</a></p>
|
191
|
+
<div class="method-source-code" id="M000064-source">
|
192
192
|
<pre>
|
193
193
|
<span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 38</span>
|
194
194
|
<span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">valid_word?</span>(<span class="ruby-identifier">word</span>)
|
@@ -204,19 +204,19 @@ word should be only alphanum chars at this point
|
|
204
204
|
|
205
205
|
<h3 class="section-bar">Public Instance methods</h3>
|
206
206
|
|
207
|
-
<div id="method-
|
208
|
-
<a name="
|
207
|
+
<div id="method-M000061" class="method-detail">
|
208
|
+
<a name="M000061"></a>
|
209
209
|
|
210
210
|
<div class="method-heading">
|
211
|
-
<a href="#
|
211
|
+
<a href="#M000061" class="method-signature">
|
212
212
|
<span class="method-name">add_text</span><span class="method-args">(text)</span>
|
213
213
|
</a>
|
214
214
|
</div>
|
215
215
|
|
216
216
|
<div class="method-description">
|
217
217
|
<p><a class="source-toggle" href="#"
|
218
|
-
onclick="toggleCode('
|
219
|
-
<div class="method-source-code" id="
|
218
|
+
onclick="toggleCode('M000061-source');return false;">[Source]</a></p>
|
219
|
+
<div class="method-source-code" id="M000061-source">
|
220
220
|
<pre>
|
221
221
|
<span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 15</span>
|
222
222
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_text</span>(<span class="ruby-identifier">text</span>)
|
@@ -235,19 +235,19 @@ word should be only alphanum chars at this point
|
|
235
235
|
</div>
|
236
236
|
</div>
|
237
237
|
|
238
|
-
<div id="method-
|
239
|
-
<a name="
|
238
|
+
<div id="method-M000062" class="method-detail">
|
239
|
+
<a name="M000062"></a>
|
240
240
|
|
241
241
|
<div class="method-heading">
|
242
|
-
<a href="#
|
242
|
+
<a href="#M000062" class="method-signature">
|
243
243
|
<span class="method-name">add_word</span><span class="method-args">(word)</span>
|
244
244
|
</a>
|
245
245
|
</div>
|
246
246
|
|
247
247
|
<div class="method-description">
|
248
248
|
<p><a class="source-toggle" href="#"
|
249
|
-
onclick="toggleCode('
|
250
|
-
<div class="method-source-code" id="
|
249
|
+
onclick="toggleCode('M000062-source');return false;">[Source]</a></p>
|
250
|
+
<div class="method-source-code" id="M000062-source">
|
251
251
|
<pre>
|
252
252
|
<span class="ruby-comment cmt"># File lib/ankusa/hasher.rb, line 27</span>
|
253
253
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_word</span>(<span class="ruby-identifier">word</span>)
|
data/docs/created.rid
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
Sun, 12 Dec 2010 13:34:32 -0500
|
data/docs/files/README_rdoc.html
CHANGED
@@ -56,7 +56,7 @@
|
|
56
56
|
</tr>
|
57
57
|
<tr class="top-aligned-row">
|
58
58
|
<td><strong>Last Update:</strong></td>
|
59
|
-
<td>
|
59
|
+
<td>Sun Dec 12 13:30:40 -0500 2010</td>
|
60
60
|
</tr>
|
61
61
|
</table>
|
62
62
|
</div>
|
@@ -72,31 +72,45 @@
|
|
72
72
|
<h1>ankusa</h1>
|
73
73
|
<p>
|
74
74
|
<a href="../classes/Ankusa.html">Ankusa</a> is a text classifier in Ruby
|
75
|
-
that
|
76
|
-
backend, the training corpus can be many
|
75
|
+
that can use either Hadoop’s HBase or Cassandra for storage. Because
|
76
|
+
it uses HBase or Cassandra as a backend, the training corpus can be many
|
77
|
+
terabytes in size.
|
77
78
|
</p>
|
78
79
|
<p>
|
79
|
-
<a href="../classes/Ankusa.html">Ankusa</a> currently
|
80
|
-
classifier. It ignores common words
|
81
|
-
others. Additionally, it uses Laplacian
|
82
|
-
|
80
|
+
<a href="../classes/Ankusa.html">Ankusa</a> currently provides both a Naive
|
81
|
+
Bayes and Kullback-Leibler divergence classifier. It ignores common words
|
82
|
+
(a.k.a. stop words) and stems all others. Additionally, it uses Laplacian
|
83
|
+
smoothing in both classification methods.
|
83
84
|
</p>
|
84
85
|
<h2>Installation</h2>
|
85
86
|
<p>
|
86
|
-
First, install HBase
|
87
|
-
|
87
|
+
First, install HBase/Hadoop or Cassandra (>= 0.7.0-rc2). Then, install
|
88
|
+
the appropriate gem:
|
89
|
+
</p>
|
90
|
+
<pre>
|
91
|
+
gem install hbaserb
|
92
|
+
# or
|
93
|
+
gem install cassandra
|
94
|
+
</pre>
|
95
|
+
<p>
|
96
|
+
If you’re using HBase, make sure the HBase Thrift interface has been
|
97
|
+
started as well. Then:
|
88
98
|
</p>
|
89
99
|
<pre>
|
90
100
|
gem install ankusa
|
91
101
|
</pre>
|
92
102
|
<h2>Basic Usage</h2>
|
103
|
+
<p>
|
104
|
+
Using the naive Bayes classifier:
|
105
|
+
</p>
|
93
106
|
<pre>
|
94
107
|
require 'rubygems'
|
95
108
|
require 'ankusa'
|
109
|
+
require 'ankusa/hbase_storage'
|
96
110
|
|
97
111
|
# connect to HBase
|
98
112
|
storage = Ankusa::HBaseStorage.new 'localhost'
|
99
|
-
c = Ankusa::
|
113
|
+
c = Ankusa::NaiveBayesClassifier.new storage
|
100
114
|
|
101
115
|
# Each of these calls will return a bag-of-words
|
102
116
|
# hash with stemmed words as keys and counts as values
|
@@ -116,11 +130,118 @@ been started as well. Then:
|
|
116
130
|
puts c.log_likelihoods "This is some spammy text"
|
117
131
|
|
118
132
|
# get a list of all classes
|
119
|
-
puts c.
|
133
|
+
puts c.classnames
|
120
134
|
|
121
135
|
# close connection
|
122
136
|
storage.close
|
123
137
|
</pre>
|
138
|
+
<h2>KL Divergence Classifier</h2>
|
139
|
+
<p>
|
140
|
+
There is a Kullback–Leibler divergence classifier as well. KL divergence
|
141
|
+
is a distance measure (though not a true metric because it does not satisfy
|
142
|
+
the triangle inequality). The KL classifier simply measures the relative
|
143
|
+
entropy between the text you want to classify and each of the classes. The
|
144
|
+
class with the shortest "distance" is the best class. You may
|
145
|
+
find that for an especially large corpus it may be slightly faster to use
|
146
|
+
this classifier (since prior probabilities are never calculated, only
|
147
|
+
likelihoods).
|
148
|
+
</p>
|
149
|
+
<p>
|
150
|
+
The API is the same as the NaiveBayesClassifier, except rather than calling
|
151
|
+
"classifications" if you want actual numbers you call
|
152
|
+
"distances".
|
153
|
+
</p>
|
154
|
+
<pre>
|
155
|
+
require 'rubygems'
|
156
|
+
require 'ankusa'
|
157
|
+
require 'ankusa/hbase_storage'
|
158
|
+
|
159
|
+
# connect to HBase
|
160
|
+
storage = Ankusa::HBaseStorage.new 'localhost'
|
161
|
+
c = Ankusa::KLDivergenceClassifier.new storage
|
162
|
+
|
163
|
+
# Each of these calls will return a bag-of-words
|
164
|
+
# hash with stemmed words as keys and counts as values
|
165
|
+
c.train :spam, "This is some spammy text"
|
166
|
+
c.train :good, "This is not the bad stuff"
|
167
|
+
|
168
|
+
# This will return the most likely class (as symbol)
|
169
|
+
puts c.classify "This is some spammy text"
|
170
|
+
|
171
|
+
# This will return Hash with classes as keys and
|
172
|
+
# distances >= 0 as values
|
173
|
+
puts c.distances "This is some spammy text"
|
174
|
+
|
175
|
+
# get a list of all classes
|
176
|
+
puts c.classnames
|
177
|
+
|
178
|
+
# close connection
|
179
|
+
storage.close
|
180
|
+
</pre>
|
181
|
+
<h2>Storage Methods</h2>
|
182
|
+
<p>
|
183
|
+
<a href="../classes/Ankusa.html">Ankusa</a> has a generalized storage
|
184
|
+
interface that has been implemented for HBase, Cassandra, and in-memory
|
185
|
+
storage.
|
186
|
+
</p>
|
187
|
+
<p>
|
188
|
+
Memory storage can be used when you have a very small corpus
|
189
|
+
</p>
|
190
|
+
<pre>
|
191
|
+
require 'ankusa/memory_storage'
|
192
|
+
storage = Ankusa::MemoryStorage.new
|
193
|
+
</pre>
|
194
|
+
<p>
|
195
|
+
HBase storage:
|
196
|
+
</p>
|
197
|
+
<pre>
|
198
|
+
require 'ankusa/hbase_storage'
|
199
|
+
# defaults: host='localhost', port=9090, frequency_tablename="ankusa_word_frequencies", summary_tablename="ankusa_summary"
|
200
|
+
storage = Ankusa::HBaseStorage.new host, port, frequency_tablename, summary_tablename
|
201
|
+
</pre>
|
202
|
+
<p>
|
203
|
+
For Cassandra storage:
|
204
|
+
</p>
|
205
|
+
<ul>
|
206
|
+
<li>You will need Cassandra version 0.7.0-rc2 or greater.
|
207
|
+
|
208
|
+
</li>
|
209
|
+
<li>You will need to set a max number of classes since the current implementation of
|
210
|
+
the Ruby Cassandra client doesn’t support table scans.
|
211
|
+
|
212
|
+
</li>
|
213
|
+
<li>Prior to using the Cassandra storage you will need to run the following
|
214
|
+
command from the cassandra-cli: "create keyspace ankusa with
|
215
|
+
replication_factor = 1". This should be fixed with a new release
|
216
|
+
candidate for Cassandra.
|
217
|
+
|
218
|
+
</li>
|
219
|
+
</ul>
|
220
|
+
<p>
|
221
|
+
To use the Cassandra storage class:
|
222
|
+
</p>
|
223
|
+
<pre>
|
224
|
+
require 'ankusa/cassandra_storage'
|
225
|
+
# defaults: host='127.0.0.1', port=9160, keyspace = 'ankusa', max_classes = 100
|
226
|
+
storage = Ankusa::CassandraStorage.new host, port, keyspace, max_classes
|
227
|
+
</pre>
|
228
|
+
<h2>Running Tests</h2>
|
229
|
+
<p>
|
230
|
+
You can run the tests for any of the three storage methods. For instance,
|
231
|
+
for memory storage:
|
232
|
+
</p>
|
233
|
+
<pre>
|
234
|
+
rake test_memory
|
235
|
+
</pre>
|
236
|
+
<p>
|
237
|
+
For the other methods you will need to edit the file test/config.yml and
|
238
|
+
set the configuration params. Then:
|
239
|
+
</p>
|
240
|
+
<pre>
|
241
|
+
rake test_hbase
|
242
|
+
# or
|
243
|
+
rake test_cassandra
|
244
|
+
</pre>
|
124
245
|
|
125
246
|
</div>
|
126
247
|
|