ankusa 0.0.13 → 0.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +4 -0
- data/Gemfile.lock +16 -0
- data/README.rdoc +5 -3
- data/Rakefile +5 -5
- data/lib/ankusa/naive_bayes.rb +3 -3
- data/lib/ankusa/version.rb +1 -1
- metadata +36 -100
- data/docs/Ankusa.html +0 -229
- data/docs/Ankusa/CassandraStorage.html +0 -801
- data/docs/Ankusa/Classifier.html +0 -440
- data/docs/Ankusa/FileSystemStorage.html +0 -376
- data/docs/Ankusa/HBaseStorage.html +0 -845
- data/docs/Ankusa/KLDivergenceClassifier.html +0 -265
- data/docs/Ankusa/MemoryStorage.html +0 -672
- data/docs/Ankusa/NaiveBayesClassifier.html +0 -313
- data/docs/Ankusa/TextHash.html +0 -390
- data/docs/README_rdoc.html +0 -268
- data/docs/String.html +0 -241
- data/docs/created.rid +0 -14
- data/docs/images/brick.png +0 -0
- data/docs/images/brick_link.png +0 -0
- data/docs/images/bug.png +0 -0
- data/docs/images/bullet_black.png +0 -0
- data/docs/images/bullet_toggle_minus.png +0 -0
- data/docs/images/bullet_toggle_plus.png +0 -0
- data/docs/images/date.png +0 -0
- data/docs/images/find.png +0 -0
- data/docs/images/loadingAnimation.gif +0 -0
- data/docs/images/macFFBgHack.png +0 -0
- data/docs/images/package.png +0 -0
- data/docs/images/page_green.png +0 -0
- data/docs/images/page_white_text.png +0 -0
- data/docs/images/page_white_width.png +0 -0
- data/docs/images/plugin.png +0 -0
- data/docs/images/ruby.png +0 -0
- data/docs/images/tag_green.png +0 -0
- data/docs/images/wrench.png +0 -0
- data/docs/images/wrench_orange.png +0 -0
- data/docs/images/zoom.png +0 -0
- data/docs/index.html +0 -212
- data/docs/js/darkfish.js +0 -116
- data/docs/js/jquery.js +0 -32
- data/docs/js/quicksearch.js +0 -114
- data/docs/js/thickbox-compressed.js +0 -10
- data/docs/lib/ankusa/cassandra_storage_rb.html +0 -54
- data/docs/lib/ankusa/classifier_rb.html +0 -52
- data/docs/lib/ankusa/extensions_rb.html +0 -54
- data/docs/lib/ankusa/file_system_storage_rb.html +0 -54
- data/docs/lib/ankusa/hasher_rb.html +0 -56
- data/docs/lib/ankusa/hbase_storage_rb.html +0 -54
- data/docs/lib/ankusa/kl_divergence_rb.html +0 -52
- data/docs/lib/ankusa/memory_storage_rb.html +0 -52
- data/docs/lib/ankusa/naive_bayes_rb.html +0 -52
- data/docs/lib/ankusa/stopwords_rb.html +0 -52
- data/docs/lib/ankusa/version_rb.html +0 -52
- data/docs/lib/ankusa_rb.html +0 -64
- data/docs/rdoc.css +0 -759
@@ -1,313 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
-
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
3
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
4
|
-
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
5
|
-
<head>
|
6
|
-
<meta content="text/html; charset=UTF-8" http-equiv="Content-Type" />
|
7
|
-
|
8
|
-
<title>Class: Ankusa::NaiveBayesClassifier</title>
|
9
|
-
|
10
|
-
<link rel="stylesheet" href="../rdoc.css" type="text/css" media="screen" />
|
11
|
-
|
12
|
-
<script src="../js/jquery.js" type="text/javascript" charset="utf-8"></script>
|
13
|
-
<script src="../js/thickbox-compressed.js" type="text/javascript" charset="utf-8"></script>
|
14
|
-
<script src="../js/quicksearch.js" type="text/javascript" charset="utf-8"></script>
|
15
|
-
<script src="../js/darkfish.js" type="text/javascript" charset="utf-8"></script>
|
16
|
-
|
17
|
-
</head>
|
18
|
-
<body id="top" class="class">
|
19
|
-
|
20
|
-
<div id="metadata">
|
21
|
-
<div id="home-metadata">
|
22
|
-
<div id="home-section" class="section">
|
23
|
-
<h3 class="section-header">
|
24
|
-
<a href="../index.html">Home</a>
|
25
|
-
<a href="../index.html#classes">Classes</a>
|
26
|
-
<a href="../index.html#methods">Methods</a>
|
27
|
-
</h3>
|
28
|
-
</div>
|
29
|
-
</div>
|
30
|
-
|
31
|
-
<div id="file-metadata">
|
32
|
-
<div id="file-list-section" class="section">
|
33
|
-
<h3 class="section-header">In Files</h3>
|
34
|
-
<div class="section-body">
|
35
|
-
<ul>
|
36
|
-
|
37
|
-
<li><a href="../lib/ankusa/naive_bayes_rb.html?TB_iframe=true&height=550&width=785"
|
38
|
-
class="thickbox" title="lib/ankusa/naive_bayes.rb">lib/ankusa/naive_bayes.rb</a></li>
|
39
|
-
|
40
|
-
</ul>
|
41
|
-
</div>
|
42
|
-
</div>
|
43
|
-
|
44
|
-
|
45
|
-
</div>
|
46
|
-
|
47
|
-
<div id="class-metadata">
|
48
|
-
|
49
|
-
<!-- Parent Class -->
|
50
|
-
<div id="parent-class-section" class="section">
|
51
|
-
<h3 class="section-header">Parent</h3>
|
52
|
-
|
53
|
-
<p class="link">Object</p>
|
54
|
-
|
55
|
-
</div>
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
<!-- Method Quickref -->
|
64
|
-
<div id="method-list-section" class="section">
|
65
|
-
<h3 class="section-header">Methods</h3>
|
66
|
-
<ul class="link-list">
|
67
|
-
|
68
|
-
<li><a href="#method-i-classifications">#classifications</a></li>
|
69
|
-
|
70
|
-
<li><a href="#method-i-classify">#classify</a></li>
|
71
|
-
|
72
|
-
<li><a href="#method-i-log_likelihoods">#log_likelihoods</a></li>
|
73
|
-
|
74
|
-
</ul>
|
75
|
-
</div>
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
<!-- Included Modules -->
|
80
|
-
<div id="includes-section" class="section">
|
81
|
-
<h3 class="section-header">Included Modules</h3>
|
82
|
-
<ul class="link-list">
|
83
|
-
|
84
|
-
|
85
|
-
<li><a class="include" href="Classifier.html">Ankusa::Classifier</a></li>
|
86
|
-
|
87
|
-
|
88
|
-
</ul>
|
89
|
-
</div>
|
90
|
-
|
91
|
-
</div>
|
92
|
-
|
93
|
-
<div id="project-metadata">
|
94
|
-
|
95
|
-
|
96
|
-
<div id="fileindex-section" class="section project-section">
|
97
|
-
<h3 class="section-header">Files</h3>
|
98
|
-
<ul>
|
99
|
-
|
100
|
-
<li class="file"><a href="../README_rdoc.html">README.rdoc</a></li>
|
101
|
-
|
102
|
-
</ul>
|
103
|
-
</div>
|
104
|
-
|
105
|
-
|
106
|
-
<div id="classindex-section" class="section project-section">
|
107
|
-
<h3 class="section-header">Class/Module Index
|
108
|
-
<span class="search-toggle"><img src="../images/find.png"
|
109
|
-
height="16" width="16" alt="[+]"
|
110
|
-
title="show/hide quicksearch" /></span></h3>
|
111
|
-
<form action="#" method="get" accept-charset="utf-8" class="initially-hidden">
|
112
|
-
<fieldset>
|
113
|
-
<legend>Quicksearch</legend>
|
114
|
-
<input type="text" name="quicksearch" value=""
|
115
|
-
class="quicksearch-field" />
|
116
|
-
</fieldset>
|
117
|
-
</form>
|
118
|
-
|
119
|
-
<ul class="link-list">
|
120
|
-
|
121
|
-
<li><a href="../Ankusa.html">Ankusa</a></li>
|
122
|
-
|
123
|
-
<li><a href="../Ankusa/CassandraStorage.html">Ankusa::CassandraStorage</a></li>
|
124
|
-
|
125
|
-
<li><a href="../Ankusa/Classifier.html">Ankusa::Classifier</a></li>
|
126
|
-
|
127
|
-
<li><a href="../Ankusa/FileSystemStorage.html">Ankusa::FileSystemStorage</a></li>
|
128
|
-
|
129
|
-
<li><a href="../Ankusa/HBaseStorage.html">Ankusa::HBaseStorage</a></li>
|
130
|
-
|
131
|
-
<li><a href="../Ankusa/KLDivergenceClassifier.html">Ankusa::KLDivergenceClassifier</a></li>
|
132
|
-
|
133
|
-
<li><a href="../Ankusa/MemoryStorage.html">Ankusa::MemoryStorage</a></li>
|
134
|
-
|
135
|
-
<li><a href="../Ankusa/NaiveBayesClassifier.html">Ankusa::NaiveBayesClassifier</a></li>
|
136
|
-
|
137
|
-
<li><a href="../Ankusa/TextHash.html">Ankusa::TextHash</a></li>
|
138
|
-
|
139
|
-
<li><a href="../String.html">String</a></li>
|
140
|
-
|
141
|
-
</ul>
|
142
|
-
<div id="no-class-search-results" style="display: none;">No matching classes.</div>
|
143
|
-
</div>
|
144
|
-
|
145
|
-
|
146
|
-
</div>
|
147
|
-
</div>
|
148
|
-
|
149
|
-
<div id="documentation">
|
150
|
-
<h1 class="class">Ankusa::NaiveBayesClassifier</h1>
|
151
|
-
|
152
|
-
<div id="description" class="description">
|
153
|
-
|
154
|
-
</div><!-- description -->
|
155
|
-
|
156
|
-
|
157
|
-
<div id="5Buntitled-5D" class="documentation-section">
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
<!-- Methods -->
|
167
|
-
|
168
|
-
<div id="public-instance-method-details" class="method-section section">
|
169
|
-
<h3 class="section-header">Public Instance Methods</h3>
|
170
|
-
|
171
|
-
|
172
|
-
<div id="classifications-method" class="method-detail ">
|
173
|
-
<a name="method-i-classifications"></a>
|
174
|
-
|
175
|
-
|
176
|
-
<div class="method-heading">
|
177
|
-
<span class="method-name">classifications</span><span
|
178
|
-
class="method-args">(text, classnames=nil)</span>
|
179
|
-
<span class="method-click-advice">click to toggle source</span>
|
180
|
-
</div>
|
181
|
-
|
182
|
-
|
183
|
-
<div class="method-description">
|
184
|
-
|
185
|
-
<p>Classes is an array of classes to look at</p>
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
<div class="method-source-code" id="classifications-source">
|
190
|
-
<pre>
|
191
|
-
<span class="ruby-comment"># File lib/ankusa/naive_bayes.rb, line 13</span>
|
192
|
-
<span class="ruby-keyword">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword">nil</span>)
|
193
|
-
<span class="ruby-identifier">result</span> = <span class="ruby-identifier">log_likelihoods</span> <span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>
|
194
|
-
<span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
|
195
|
-
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = (<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">==</span> <span class="ruby-constant">INFTY</span>) <span class="ruby-operator">?</span> <span class="ruby-value">0</span> <span class="ruby-operator">:</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span>(<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>])
|
196
|
-
}
|
197
|
-
|
198
|
-
<span class="ruby-comment"># normalize to get probs</span>
|
199
|
-
<span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
|
200
|
-
<span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span> }
|
201
|
-
<span class="ruby-identifier">result</span>
|
202
|
-
<span class="ruby-keyword">end</span></pre>
|
203
|
-
</div><!-- classifications-source -->
|
204
|
-
|
205
|
-
</div>
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
</div><!-- classifications-method -->
|
211
|
-
|
212
|
-
|
213
|
-
<div id="classify-method" class="method-detail ">
|
214
|
-
<a name="method-i-classify"></a>
|
215
|
-
|
216
|
-
|
217
|
-
<div class="method-heading">
|
218
|
-
<span class="method-name">classify</span><span
|
219
|
-
class="method-args">(text, classes=nil)</span>
|
220
|
-
<span class="method-click-advice">click to toggle source</span>
|
221
|
-
</div>
|
222
|
-
|
223
|
-
|
224
|
-
<div class="method-description">
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
<div class="method-source-code" id="classify-source">
|
231
|
-
<pre>
|
232
|
-
<span class="ruby-comment"># File lib/ankusa/naive_bayes.rb, line 7</span>
|
233
|
-
<span class="ruby-keyword">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>=<span class="ruby-keyword">nil</span>)
|
234
|
-
<span class="ruby-comment"># return the most probable class</span>
|
235
|
-
<span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>).<span class="ruby-identifier">sort_by</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">c</span><span class="ruby-operator">|</span> <span class="ruby-operator">-</span><span class="ruby-identifier">c</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
|
236
|
-
<span class="ruby-keyword">end</span></pre>
|
237
|
-
</div><!-- classify-source -->
|
238
|
-
|
239
|
-
</div>
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
</div><!-- classify-method -->
|
245
|
-
|
246
|
-
|
247
|
-
<div id="log_likelihoods-method" class="method-detail ">
|
248
|
-
<a name="method-i-log_likelihoods"></a>
|
249
|
-
|
250
|
-
|
251
|
-
<div class="method-heading">
|
252
|
-
<span class="method-name">log_likelihoods</span><span
|
253
|
-
class="method-args">(text, classnames=nil)</span>
|
254
|
-
<span class="method-click-advice">click to toggle source</span>
|
255
|
-
</div>
|
256
|
-
|
257
|
-
|
258
|
-
<div class="method-description">
|
259
|
-
|
260
|
-
<p>Classes is an array of classes to look at</p>
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
<div class="method-source-code" id="log_likelihoods-source">
|
265
|
-
<pre>
|
266
|
-
<span class="ruby-comment"># File lib/ankusa/naive_bayes.rb, line 26</span>
|
267
|
-
<span class="ruby-keyword">def</span> <span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword">nil</span>)
|
268
|
-
<span class="ruby-identifier">classnames</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@classnames</span>
|
269
|
-
<span class="ruby-identifier">result</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
|
270
|
-
|
271
|
-
<span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
|
272
|
-
<span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classnames</span>)
|
273
|
-
<span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
|
274
|
-
<span class="ruby-comment"># log likelihood should be infinity if we've never seen the klass</span>
|
275
|
-
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">></span> <span class="ruby-value">0</span> <span class="ruby-operator">?</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) * <span class="ruby-identifier">count</span>) <span class="ruby-operator">:</span> <span class="ruby-constant">INFTY</span>
|
276
|
-
}
|
277
|
-
}
|
278
|
-
|
279
|
-
<span class="ruby-comment"># add the prior</span>
|
280
|
-
<span class="ruby-identifier">doc_counts</span> = <span class="ruby-identifier">doc_count_totals</span>.<span class="ruby-identifier">select</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">k</span> }.<span class="ruby-identifier">map</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">v</span> }
|
281
|
-
<span class="ruby-identifier">doc_count_total</span> = (<span class="ruby-identifier">doc_counts</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> } <span class="ruby-operator">+</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">length</span>).<span class="ruby-identifier">to_f</span>
|
282
|
-
<span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
|
283
|
-
<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>((<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>) <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">doc_count_total</span>)
|
284
|
-
}
|
285
|
-
|
286
|
-
<span class="ruby-identifier">result</span>
|
287
|
-
<span class="ruby-keyword">end</span></pre>
|
288
|
-
</div><!-- log_likelihoods-source -->
|
289
|
-
|
290
|
-
</div>
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
</div><!-- log_likelihoods-method -->
|
296
|
-
|
297
|
-
|
298
|
-
</div><!-- public-instance-method-details -->
|
299
|
-
|
300
|
-
</div><!-- 5Buntitled-5D -->
|
301
|
-
|
302
|
-
|
303
|
-
</div><!-- documentation -->
|
304
|
-
|
305
|
-
<div id="validator-badges">
|
306
|
-
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
307
|
-
<p><small>Generated with the <a href="http://deveiate.org/projects/Darkfish-Rdoc/">Darkfish
|
308
|
-
Rdoc Generator</a> 2</small>.</p>
|
309
|
-
</div>
|
310
|
-
|
311
|
-
</body>
|
312
|
-
</html>
|
313
|
-
|
data/docs/Ankusa/TextHash.html
DELETED
@@ -1,390 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
-
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
3
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
4
|
-
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
5
|
-
<head>
|
6
|
-
<meta content="text/html; charset=UTF-8" http-equiv="Content-Type" />
|
7
|
-
|
8
|
-
<title>Class: Ankusa::TextHash</title>
|
9
|
-
|
10
|
-
<link rel="stylesheet" href="../rdoc.css" type="text/css" media="screen" />
|
11
|
-
|
12
|
-
<script src="../js/jquery.js" type="text/javascript" charset="utf-8"></script>
|
13
|
-
<script src="../js/thickbox-compressed.js" type="text/javascript" charset="utf-8"></script>
|
14
|
-
<script src="../js/quicksearch.js" type="text/javascript" charset="utf-8"></script>
|
15
|
-
<script src="../js/darkfish.js" type="text/javascript" charset="utf-8"></script>
|
16
|
-
|
17
|
-
</head>
|
18
|
-
<body id="top" class="class">
|
19
|
-
|
20
|
-
<div id="metadata">
|
21
|
-
<div id="home-metadata">
|
22
|
-
<div id="home-section" class="section">
|
23
|
-
<h3 class="section-header">
|
24
|
-
<a href="../index.html">Home</a>
|
25
|
-
<a href="../index.html#classes">Classes</a>
|
26
|
-
<a href="../index.html#methods">Methods</a>
|
27
|
-
</h3>
|
28
|
-
</div>
|
29
|
-
</div>
|
30
|
-
|
31
|
-
<div id="file-metadata">
|
32
|
-
<div id="file-list-section" class="section">
|
33
|
-
<h3 class="section-header">In Files</h3>
|
34
|
-
<div class="section-body">
|
35
|
-
<ul>
|
36
|
-
|
37
|
-
<li><a href="../lib/ankusa/hasher_rb.html?TB_iframe=true&height=550&width=785"
|
38
|
-
class="thickbox" title="lib/ankusa/hasher.rb">lib/ankusa/hasher.rb</a></li>
|
39
|
-
|
40
|
-
</ul>
|
41
|
-
</div>
|
42
|
-
</div>
|
43
|
-
|
44
|
-
|
45
|
-
</div>
|
46
|
-
|
47
|
-
<div id="class-metadata">
|
48
|
-
|
49
|
-
<!-- Parent Class -->
|
50
|
-
<div id="parent-class-section" class="section">
|
51
|
-
<h3 class="section-header">Parent</h3>
|
52
|
-
|
53
|
-
<p class="link">Hash</p>
|
54
|
-
|
55
|
-
</div>
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
<!-- Method Quickref -->
|
64
|
-
<div id="method-list-section" class="section">
|
65
|
-
<h3 class="section-header">Methods</h3>
|
66
|
-
<ul class="link-list">
|
67
|
-
|
68
|
-
<li><a href="#method-c-atomize">::atomize</a></li>
|
69
|
-
|
70
|
-
<li><a href="#method-c-new">::new</a></li>
|
71
|
-
|
72
|
-
<li><a href="#method-c-valid_word-3F">::valid_word?</a></li>
|
73
|
-
|
74
|
-
<li><a href="#method-i-add_text">#add_text</a></li>
|
75
|
-
|
76
|
-
<li><a href="#method-i-add_word">#add_word</a></li>
|
77
|
-
|
78
|
-
</ul>
|
79
|
-
</div>
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
</div>
|
84
|
-
|
85
|
-
<div id="project-metadata">
|
86
|
-
|
87
|
-
|
88
|
-
<div id="fileindex-section" class="section project-section">
|
89
|
-
<h3 class="section-header">Files</h3>
|
90
|
-
<ul>
|
91
|
-
|
92
|
-
<li class="file"><a href="../README_rdoc.html">README.rdoc</a></li>
|
93
|
-
|
94
|
-
</ul>
|
95
|
-
</div>
|
96
|
-
|
97
|
-
|
98
|
-
<div id="classindex-section" class="section project-section">
|
99
|
-
<h3 class="section-header">Class/Module Index
|
100
|
-
<span class="search-toggle"><img src="../images/find.png"
|
101
|
-
height="16" width="16" alt="[+]"
|
102
|
-
title="show/hide quicksearch" /></span></h3>
|
103
|
-
<form action="#" method="get" accept-charset="utf-8" class="initially-hidden">
|
104
|
-
<fieldset>
|
105
|
-
<legend>Quicksearch</legend>
|
106
|
-
<input type="text" name="quicksearch" value=""
|
107
|
-
class="quicksearch-field" />
|
108
|
-
</fieldset>
|
109
|
-
</form>
|
110
|
-
|
111
|
-
<ul class="link-list">
|
112
|
-
|
113
|
-
<li><a href="../Ankusa.html">Ankusa</a></li>
|
114
|
-
|
115
|
-
<li><a href="../Ankusa/CassandraStorage.html">Ankusa::CassandraStorage</a></li>
|
116
|
-
|
117
|
-
<li><a href="../Ankusa/Classifier.html">Ankusa::Classifier</a></li>
|
118
|
-
|
119
|
-
<li><a href="../Ankusa/FileSystemStorage.html">Ankusa::FileSystemStorage</a></li>
|
120
|
-
|
121
|
-
<li><a href="../Ankusa/HBaseStorage.html">Ankusa::HBaseStorage</a></li>
|
122
|
-
|
123
|
-
<li><a href="../Ankusa/KLDivergenceClassifier.html">Ankusa::KLDivergenceClassifier</a></li>
|
124
|
-
|
125
|
-
<li><a href="../Ankusa/MemoryStorage.html">Ankusa::MemoryStorage</a></li>
|
126
|
-
|
127
|
-
<li><a href="../Ankusa/NaiveBayesClassifier.html">Ankusa::NaiveBayesClassifier</a></li>
|
128
|
-
|
129
|
-
<li><a href="../Ankusa/TextHash.html">Ankusa::TextHash</a></li>
|
130
|
-
|
131
|
-
<li><a href="../String.html">String</a></li>
|
132
|
-
|
133
|
-
</ul>
|
134
|
-
<div id="no-class-search-results" style="display: none;">No matching classes.</div>
|
135
|
-
</div>
|
136
|
-
|
137
|
-
|
138
|
-
</div>
|
139
|
-
</div>
|
140
|
-
|
141
|
-
<div id="documentation">
|
142
|
-
<h1 class="class">Ankusa::TextHash</h1>
|
143
|
-
|
144
|
-
<div id="description" class="description">
|
145
|
-
|
146
|
-
</div><!-- description -->
|
147
|
-
|
148
|
-
|
149
|
-
<div id="5Buntitled-5D" class="documentation-section">
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
<!-- Attributes -->
|
158
|
-
<div id="attribute-method-details" class="method-section section">
|
159
|
-
<h3 class="section-header">Attributes</h3>
|
160
|
-
|
161
|
-
|
162
|
-
<div id="word_count-attribute-method" class="method-detail">
|
163
|
-
<a name="word_count"></a>
|
164
|
-
|
165
|
-
<div class="method-heading attribute-method-heading">
|
166
|
-
<span class="method-name">word_count</span><span
|
167
|
-
class="attribute-access-type">[R]</span>
|
168
|
-
</div>
|
169
|
-
|
170
|
-
<div class="method-description">
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
</div>
|
175
|
-
</div>
|
176
|
-
|
177
|
-
</div><!-- attribute-method-details -->
|
178
|
-
|
179
|
-
|
180
|
-
<!-- Methods -->
|
181
|
-
|
182
|
-
<div id="public-class-method-details" class="method-section section">
|
183
|
-
<h3 class="section-header">Public Class Methods</h3>
|
184
|
-
|
185
|
-
|
186
|
-
<div id="atomize-method" class="method-detail ">
|
187
|
-
<a name="method-c-atomize"></a>
|
188
|
-
|
189
|
-
|
190
|
-
<div class="method-heading">
|
191
|
-
<span class="method-name">atomize</span><span
|
192
|
-
class="method-args">(text)</span>
|
193
|
-
<span class="method-click-advice">click to toggle source</span>
|
194
|
-
</div>
|
195
|
-
|
196
|
-
|
197
|
-
<div class="method-description">
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
<div class="method-source-code" id="atomize-source">
|
204
|
-
<pre>
|
205
|
-
<span class="ruby-comment"># File lib/ankusa/hasher.rb, line 15</span>
|
206
|
-
<span class="ruby-keyword">def</span> <span class="ruby-keyword">self</span>.<span class="ruby-identifier">atomize</span>(<span class="ruby-identifier">text</span>)
|
207
|
-
<span class="ruby-identifier">text</span>.<span class="ruby-identifier">downcase</span>.<span class="ruby-identifier">to_ascii</span>.<span class="ruby-identifier">tr</span>(<span class="ruby-string">'-'</span>, <span class="ruby-string">' '</span>).<span class="ruby-identifier">gsub</span>(<span class="ruby-regexp">/[^\w\s]/</span>,<span class="ruby-string">" "</span>).<span class="ruby-identifier">split</span>
|
208
|
-
<span class="ruby-keyword">end</span></pre>
|
209
|
-
</div><!-- atomize-source -->
|
210
|
-
|
211
|
-
</div>
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
</div><!-- atomize-method -->
|
217
|
-
|
218
|
-
|
219
|
-
<div id="new-method" class="method-detail ">
|
220
|
-
<a name="method-c-new"></a>
|
221
|
-
|
222
|
-
|
223
|
-
<div class="method-heading">
|
224
|
-
<span class="method-name">new</span><span
|
225
|
-
class="method-args">(text=nil)</span>
|
226
|
-
<span class="method-click-advice">click to toggle source</span>
|
227
|
-
</div>
|
228
|
-
|
229
|
-
|
230
|
-
<div class="method-description">
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
<div class="method-source-code" id="new-source">
|
237
|
-
<pre>
|
238
|
-
<span class="ruby-comment"># File lib/ankusa/hasher.rb, line 9</span>
|
239
|
-
<span class="ruby-keyword">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">text</span>=<span class="ruby-keyword">nil</span>)
|
240
|
-
<span class="ruby-keyword">super</span> <span class="ruby-value">0</span>
|
241
|
-
<span class="ruby-ivar">@word_count</span> = <span class="ruby-value">0</span>
|
242
|
-
<span class="ruby-identifier">add_text</span>(<span class="ruby-identifier">text</span>) <span class="ruby-keyword">unless</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">nil?</span>
|
243
|
-
<span class="ruby-keyword">end</span></pre>
|
244
|
-
</div><!-- new-source -->
|
245
|
-
|
246
|
-
</div>
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
</div><!-- new-method -->
|
252
|
-
|
253
|
-
|
254
|
-
<div id="valid_word-3F-method" class="method-detail ">
|
255
|
-
<a name="method-c-valid_word-3F"></a>
|
256
|
-
|
257
|
-
|
258
|
-
<div class="method-heading">
|
259
|
-
<span class="method-name">valid_word?</span><span
|
260
|
-
class="method-args">(word)</span>
|
261
|
-
<span class="method-click-advice">click to toggle source</span>
|
262
|
-
</div>
|
263
|
-
|
264
|
-
|
265
|
-
<div class="method-description">
|
266
|
-
|
267
|
-
<p>word should be only alphanum chars at this point</p>
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
<div class="method-source-code" id="valid_word-3F-source">
|
272
|
-
<pre>
|
273
|
-
<span class="ruby-comment"># File lib/ankusa/hasher.rb, line 20</span>
|
274
|
-
<span class="ruby-keyword">def</span> <span class="ruby-keyword">self</span>.<span class="ruby-identifier">valid_word?</span>(<span class="ruby-identifier">word</span>)
|
275
|
-
<span class="ruby-keyword">return</span> <span class="ruby-keyword">true</span> <span class="ruby-keyword">unless</span> <span class="ruby-constant">Ankusa</span><span class="ruby-operator">::</span><span class="ruby-constant">STOPWORDS</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">word</span> <span class="ruby-operator">||</span> <span class="ruby-identifier">word</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator"><</span> <span class="ruby-value">3</span> <span class="ruby-operator">||</span> <span class="ruby-identifier">word</span>.<span class="ruby-identifier">numeric?</span>
|
276
|
-
<span class="ruby-keyword">end</span></pre>
|
277
|
-
</div><!-- valid_word-3F-source -->
|
278
|
-
|
279
|
-
</div>
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
</div><!-- valid_word-3F-method -->
|
285
|
-
|
286
|
-
|
287
|
-
</div><!-- public-class-method-details -->
|
288
|
-
|
289
|
-
<div id="public-instance-method-details" class="method-section section">
|
290
|
-
<h3 class="section-header">Public Instance Methods</h3>
|
291
|
-
|
292
|
-
|
293
|
-
<div id="add_text-method" class="method-detail ">
|
294
|
-
<a name="method-i-add_text"></a>
|
295
|
-
|
296
|
-
|
297
|
-
<div class="method-heading">
|
298
|
-
<span class="method-name">add_text</span><span
|
299
|
-
class="method-args">(text)</span>
|
300
|
-
<span class="method-click-advice">click to toggle source</span>
|
301
|
-
</div>
|
302
|
-
|
303
|
-
|
304
|
-
<div class="method-description">
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
<div class="method-source-code" id="add_text-source">
|
311
|
-
<pre>
|
312
|
-
<span class="ruby-comment"># File lib/ankusa/hasher.rb, line 24</span>
|
313
|
-
<span class="ruby-keyword">def</span> <span class="ruby-identifier">add_text</span>(<span class="ruby-identifier">text</span>)
|
314
|
-
<span class="ruby-keyword">if</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">instance_of?</span> <span class="ruby-constant">Array</span>
|
315
|
-
<span class="ruby-identifier">text</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">t</span><span class="ruby-operator">|</span> <span class="ruby-identifier">add_text</span> <span class="ruby-identifier">t</span> }
|
316
|
-
<span class="ruby-keyword">else</span>
|
317
|
-
<span class="ruby-comment"># replace dashes with spaces, then get rid of non-word/non-space characters, </span>
|
318
|
-
<span class="ruby-comment"># then split by space to get words</span>
|
319
|
-
<span class="ruby-identifier">words</span> = <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">atomize</span> <span class="ruby-identifier">text</span>
|
320
|
-
<span class="ruby-identifier">words</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span><span class="ruby-operator">|</span> <span class="ruby-identifier">add_word</span>(<span class="ruby-identifier">word</span>) <span class="ruby-keyword">if</span> <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">valid_word?</span>(<span class="ruby-identifier">word</span>) }
|
321
|
-
<span class="ruby-keyword">end</span>
|
322
|
-
<span class="ruby-keyword">self</span>
|
323
|
-
<span class="ruby-keyword">end</span></pre>
|
324
|
-
</div><!-- add_text-source -->
|
325
|
-
|
326
|
-
</div>
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
</div><!-- add_text-method -->
|
332
|
-
|
333
|
-
|
334
|
-
</div><!-- public-instance-method-details -->
|
335
|
-
|
336
|
-
<div id="protected-instance-method-details" class="method-section section">
|
337
|
-
<h3 class="section-header">Protected Instance Methods</h3>
|
338
|
-
|
339
|
-
|
340
|
-
<div id="add_word-method" class="method-detail ">
|
341
|
-
<a name="method-i-add_word"></a>
|
342
|
-
|
343
|
-
|
344
|
-
<div class="method-heading">
|
345
|
-
<span class="method-name">add_word</span><span
|
346
|
-
class="method-args">(word)</span>
|
347
|
-
<span class="method-click-advice">click to toggle source</span>
|
348
|
-
</div>
|
349
|
-
|
350
|
-
|
351
|
-
<div class="method-description">
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
<div class="method-source-code" id="add_word-source">
|
358
|
-
<pre>
|
359
|
-
<span class="ruby-comment"># File lib/ankusa/hasher.rb, line 38</span>
|
360
|
-
<span class="ruby-keyword">def</span> <span class="ruby-identifier">add_word</span>(<span class="ruby-identifier">word</span>)
|
361
|
-
<span class="ruby-ivar">@word_count</span> <span class="ruby-operator">+=</span> <span class="ruby-value">1</span>
|
362
|
-
<span class="ruby-identifier">key</span> = <span class="ruby-identifier">word</span>.<span class="ruby-identifier">stem</span>.<span class="ruby-identifier">intern</span>
|
363
|
-
<span class="ruby-identifier">store</span> <span class="ruby-identifier">key</span>, <span class="ruby-identifier">fetch</span>(<span class="ruby-identifier">key</span>, <span class="ruby-value">0</span>)<span class="ruby-operator">+</span><span class="ruby-value">1</span>
|
364
|
-
<span class="ruby-keyword">end</span></pre>
|
365
|
-
</div><!-- add_word-source -->
|
366
|
-
|
367
|
-
</div>
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
</div><!-- add_word-method -->
|
373
|
-
|
374
|
-
|
375
|
-
</div><!-- protected-instance-method-details -->
|
376
|
-
|
377
|
-
</div><!-- 5Buntitled-5D -->
|
378
|
-
|
379
|
-
|
380
|
-
</div><!-- documentation -->
|
381
|
-
|
382
|
-
<div id="validator-badges">
|
383
|
-
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
384
|
-
<p><small>Generated with the <a href="http://deveiate.org/projects/Darkfish-Rdoc/">Darkfish
|
385
|
-
Rdoc Generator</a> 2</small>.</p>
|
386
|
-
</div>
|
387
|
-
|
388
|
-
</body>
|
389
|
-
</html>
|
390
|
-
|