ankusa 0.0.9 → 0.0.10

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58) hide show
  1. data/Rakefile +3 -3
  2. data/docs/Ankusa.html +229 -0
  3. data/docs/Ankusa/CassandraStorage.html +801 -0
  4. data/docs/Ankusa/Classifier.html +440 -0
  5. data/docs/Ankusa/FileSystemStorage.html +376 -0
  6. data/docs/Ankusa/HBaseStorage.html +845 -0
  7. data/docs/Ankusa/KLDivergenceClassifier.html +265 -0
  8. data/docs/Ankusa/MemoryStorage.html +672 -0
  9. data/docs/Ankusa/NaiveBayesClassifier.html +313 -0
  10. data/docs/Ankusa/TextHash.html +390 -0
  11. data/docs/README_rdoc.html +268 -0
  12. data/docs/String.html +241 -0
  13. data/docs/created.rid +14 -0
  14. data/docs/images/brick.png +0 -0
  15. data/docs/images/brick_link.png +0 -0
  16. data/docs/images/bug.png +0 -0
  17. data/docs/images/bullet_black.png +0 -0
  18. data/docs/images/bullet_toggle_minus.png +0 -0
  19. data/docs/images/bullet_toggle_plus.png +0 -0
  20. data/docs/images/date.png +0 -0
  21. data/docs/images/find.png +0 -0
  22. data/docs/images/loadingAnimation.gif +0 -0
  23. data/docs/images/macFFBgHack.png +0 -0
  24. data/docs/images/package.png +0 -0
  25. data/docs/images/page_green.png +0 -0
  26. data/docs/images/page_white_text.png +0 -0
  27. data/docs/images/page_white_width.png +0 -0
  28. data/docs/images/plugin.png +0 -0
  29. data/docs/images/ruby.png +0 -0
  30. data/docs/images/tag_green.png +0 -0
  31. data/docs/images/wrench.png +0 -0
  32. data/docs/images/wrench_orange.png +0 -0
  33. data/docs/images/zoom.png +0 -0
  34. data/docs/index.html +212 -0
  35. data/docs/js/darkfish.js +116 -0
  36. data/docs/js/jquery.js +32 -0
  37. data/docs/js/quicksearch.js +114 -0
  38. data/docs/js/thickbox-compressed.js +10 -0
  39. data/docs/lib/ankusa/cassandra_storage_rb.html +54 -0
  40. data/docs/lib/ankusa/classifier_rb.html +52 -0
  41. data/docs/lib/ankusa/extensions_rb.html +54 -0
  42. data/docs/lib/ankusa/file_system_storage_rb.html +54 -0
  43. data/docs/lib/ankusa/hasher_rb.html +56 -0
  44. data/docs/lib/ankusa/hbase_storage_rb.html +54 -0
  45. data/docs/lib/ankusa/kl_divergence_rb.html +52 -0
  46. data/docs/lib/ankusa/memory_storage_rb.html +52 -0
  47. data/docs/lib/ankusa/naive_bayes_rb.html +52 -0
  48. data/docs/lib/ankusa/stopwords_rb.html +52 -0
  49. data/docs/lib/ankusa/version_rb.html +52 -0
  50. data/docs/lib/ankusa_rb.html +64 -0
  51. data/docs/rdoc.css +759 -0
  52. data/lib/ankusa/cassandra_storage.rb +2 -2
  53. data/lib/ankusa/classifier.rb +2 -2
  54. data/lib/ankusa/hasher.rb +17 -17
  55. data/lib/ankusa/hbase_storage.rb +2 -2
  56. data/lib/ankusa/stopwords.rb +1 -1
  57. data/lib/ankusa/version.rb +1 -1
  58. metadata +56 -8
@@ -0,0 +1,313 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
3
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
4
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
5
+ <head>
6
+ <meta content="text/html; charset=UTF-8" http-equiv="Content-Type" />
7
+
8
+ <title>Class: Ankusa::NaiveBayesClassifier</title>
9
+
10
+ <link rel="stylesheet" href="../rdoc.css" type="text/css" media="screen" />
11
+
12
+ <script src="../js/jquery.js" type="text/javascript" charset="utf-8"></script>
13
+ <script src="../js/thickbox-compressed.js" type="text/javascript" charset="utf-8"></script>
14
+ <script src="../js/quicksearch.js" type="text/javascript" charset="utf-8"></script>
15
+ <script src="../js/darkfish.js" type="text/javascript" charset="utf-8"></script>
16
+
17
+ </head>
18
+ <body id="top" class="class">
19
+
20
+ <div id="metadata">
21
+ <div id="home-metadata">
22
+ <div id="home-section" class="section">
23
+ <h3 class="section-header">
24
+ <a href="../index.html">Home</a>
25
+ <a href="../index.html#classes">Classes</a>
26
+ <a href="../index.html#methods">Methods</a>
27
+ </h3>
28
+ </div>
29
+ </div>
30
+
31
+ <div id="file-metadata">
32
+ <div id="file-list-section" class="section">
33
+ <h3 class="section-header">In Files</h3>
34
+ <div class="section-body">
35
+ <ul>
36
+
37
+ <li><a href="../lib/ankusa/naive_bayes_rb.html?TB_iframe=true&amp;height=550&amp;width=785"
38
+ class="thickbox" title="lib/ankusa/naive_bayes.rb">lib/ankusa/naive_bayes.rb</a></li>
39
+
40
+ </ul>
41
+ </div>
42
+ </div>
43
+
44
+
45
+ </div>
46
+
47
+ <div id="class-metadata">
48
+
49
+ <!-- Parent Class -->
50
+ <div id="parent-class-section" class="section">
51
+ <h3 class="section-header">Parent</h3>
52
+
53
+ <p class="link">Object</p>
54
+
55
+ </div>
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+ <!-- Method Quickref -->
64
+ <div id="method-list-section" class="section">
65
+ <h3 class="section-header">Methods</h3>
66
+ <ul class="link-list">
67
+
68
+ <li><a href="#method-i-classifications">#classifications</a></li>
69
+
70
+ <li><a href="#method-i-classify">#classify</a></li>
71
+
72
+ <li><a href="#method-i-log_likelihoods">#log_likelihoods</a></li>
73
+
74
+ </ul>
75
+ </div>
76
+
77
+
78
+
79
+ <!-- Included Modules -->
80
+ <div id="includes-section" class="section">
81
+ <h3 class="section-header">Included Modules</h3>
82
+ <ul class="link-list">
83
+
84
+
85
+ <li><a class="include" href="Classifier.html">Ankusa::Classifier</a></li>
86
+
87
+
88
+ </ul>
89
+ </div>
90
+
91
+ </div>
92
+
93
+ <div id="project-metadata">
94
+
95
+
96
+ <div id="fileindex-section" class="section project-section">
97
+ <h3 class="section-header">Files</h3>
98
+ <ul>
99
+
100
+ <li class="file"><a href="../README_rdoc.html">README.rdoc</a></li>
101
+
102
+ </ul>
103
+ </div>
104
+
105
+
106
+ <div id="classindex-section" class="section project-section">
107
+ <h3 class="section-header">Class/Module Index
108
+ <span class="search-toggle"><img src="../images/find.png"
109
+ height="16" width="16" alt="[+]"
110
+ title="show/hide quicksearch" /></span></h3>
111
+ <form action="#" method="get" accept-charset="utf-8" class="initially-hidden">
112
+ <fieldset>
113
+ <legend>Quicksearch</legend>
114
+ <input type="text" name="quicksearch" value=""
115
+ class="quicksearch-field" />
116
+ </fieldset>
117
+ </form>
118
+
119
+ <ul class="link-list">
120
+
121
+ <li><a href="../Ankusa.html">Ankusa</a></li>
122
+
123
+ <li><a href="../Ankusa/CassandraStorage.html">Ankusa::CassandraStorage</a></li>
124
+
125
+ <li><a href="../Ankusa/Classifier.html">Ankusa::Classifier</a></li>
126
+
127
+ <li><a href="../Ankusa/FileSystemStorage.html">Ankusa::FileSystemStorage</a></li>
128
+
129
+ <li><a href="../Ankusa/HBaseStorage.html">Ankusa::HBaseStorage</a></li>
130
+
131
+ <li><a href="../Ankusa/KLDivergenceClassifier.html">Ankusa::KLDivergenceClassifier</a></li>
132
+
133
+ <li><a href="../Ankusa/MemoryStorage.html">Ankusa::MemoryStorage</a></li>
134
+
135
+ <li><a href="../Ankusa/NaiveBayesClassifier.html">Ankusa::NaiveBayesClassifier</a></li>
136
+
137
+ <li><a href="../Ankusa/TextHash.html">Ankusa::TextHash</a></li>
138
+
139
+ <li><a href="../String.html">String</a></li>
140
+
141
+ </ul>
142
+ <div id="no-class-search-results" style="display: none;">No matching classes.</div>
143
+ </div>
144
+
145
+
146
+ </div>
147
+ </div>
148
+
149
+ <div id="documentation">
150
+ <h1 class="class">Ankusa::NaiveBayesClassifier</h1>
151
+
152
+ <div id="description" class="description">
153
+
154
+ </div><!-- description -->
155
+
156
+
157
+ <div id="5Buntitled-5D" class="documentation-section">
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+
166
+ <!-- Methods -->
167
+
168
+ <div id="public-instance-method-details" class="method-section section">
169
+ <h3 class="section-header">Public Instance Methods</h3>
170
+
171
+
172
+ <div id="classifications-method" class="method-detail ">
173
+ <a name="method-i-classifications"></a>
174
+
175
+
176
+ <div class="method-heading">
177
+ <span class="method-name">classifications</span><span
178
+ class="method-args">(text, classnames=nil)</span>
179
+ <span class="method-click-advice">click to toggle source</span>
180
+ </div>
181
+
182
+
183
+ <div class="method-description">
184
+
185
+ <p>classnames is an optional array of class names to consider when classifying the text.</p>
186
+
187
+
188
+
189
+ <div class="method-source-code" id="classifications-source">
190
+ <pre>
191
+ <span class="ruby-comment"># File lib/ankusa/naive_bayes.rb, line 13</span>
192
+ <span class="ruby-keyword">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword">nil</span>)
193
+ <span class="ruby-identifier">result</span> = <span class="ruby-identifier">log_likelihoods</span> <span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>
194
+ <span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
195
+ <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = (<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">==</span> <span class="ruby-constant">INFTY</span>) <span class="ruby-operator">?</span> <span class="ruby-value">0</span> <span class="ruby-operator">:</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span>(<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>])
196
+ }
197
+
198
+ <span class="ruby-comment"># normalize to get probs</span>
199
+ <span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
200
+ <span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span> }
201
+ <span class="ruby-identifier">result</span>
202
+ <span class="ruby-keyword">end</span></pre>
203
+ </div><!-- classifications-source -->
204
+
205
+ </div>
206
+
207
+
208
+
209
+
210
+ </div><!-- classifications-method -->
211
+
212
+
213
+ <div id="classify-method" class="method-detail ">
214
+ <a name="method-i-classify"></a>
215
+
216
+
217
+ <div class="method-heading">
218
+ <span class="method-name">classify</span><span
219
+ class="method-args">(text, classes=nil)</span>
220
+ <span class="method-click-advice">click to toggle source</span>
221
+ </div>
222
+
223
+
224
+ <div class="method-description">
225
+
226
+
227
+
228
+
229
+
230
+ <div class="method-source-code" id="classify-source">
231
+ <pre>
232
+ <span class="ruby-comment"># File lib/ankusa/naive_bayes.rb, line 7</span>
233
+ <span class="ruby-keyword">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>=<span class="ruby-keyword">nil</span>)
234
+ <span class="ruby-comment"># return the most probable class</span>
235
+ <span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>).<span class="ruby-identifier">sort_by</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">c</span><span class="ruby-operator">|</span> <span class="ruby-operator">-</span><span class="ruby-identifier">c</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
236
+ <span class="ruby-keyword">end</span></pre>
237
+ </div><!-- classify-source -->
238
+
239
+ </div>
240
+
241
+
242
+
243
+
244
+ </div><!-- classify-method -->
245
+
246
+
247
+ <div id="log_likelihoods-method" class="method-detail ">
248
+ <a name="method-i-log_likelihoods"></a>
249
+
250
+
251
+ <div class="method-heading">
252
+ <span class="method-name">log_likelihoods</span><span
253
+ class="method-args">(text, classnames=nil)</span>
254
+ <span class="method-click-advice">click to toggle source</span>
255
+ </div>
256
+
257
+
258
+ <div class="method-description">
259
+
260
+ <p>classnames is an optional array of class names to consider; when nil, the classifier's own @classnames is used.</p>
261
+
262
+
263
+
264
+ <div class="method-source-code" id="log_likelihoods-source">
265
+ <pre>
266
+ <span class="ruby-comment"># File lib/ankusa/naive_bayes.rb, line 26</span>
267
+ <span class="ruby-keyword">def</span> <span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword">nil</span>)
268
+ <span class="ruby-identifier">classnames</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@classnames</span>
269
+ <span class="ruby-identifier">result</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
270
+
271
+ <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
272
+ <span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classnames</span>)
273
+ <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
274
+ <span class="ruby-comment"># log likelihood should be infinity if we've never seen the klass</span>
275
+ <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">&gt;</span> <span class="ruby-value">0</span> <span class="ruby-operator">?</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) * <span class="ruby-identifier">count</span>) <span class="ruby-operator">:</span> <span class="ruby-constant">INFTY</span>
276
+ }
277
+ }
278
+
279
+ <span class="ruby-comment"># add the prior</span>
280
+ <span class="ruby-identifier">doc_counts</span> = <span class="ruby-identifier">doc_count_totals</span>.<span class="ruby-identifier">select</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">k</span> }.<span class="ruby-identifier">map</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">v</span> }
281
+ <span class="ruby-identifier">doc_count_total</span> = (<span class="ruby-identifier">doc_counts</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> } <span class="ruby-operator">+</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">length</span>).<span class="ruby-identifier">to_f</span>
282
+ <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
283
+ <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>((<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>) <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">doc_count_total</span>)
284
+ }
285
+
286
+ <span class="ruby-identifier">result</span>
287
+ <span class="ruby-keyword">end</span></pre>
288
+ </div><!-- log_likelihoods-source -->
289
+
290
+ </div>
291
+
292
+
293
+
294
+
295
+ </div><!-- log_likelihoods-method -->
296
+
297
+
298
+ </div><!-- public-instance-method-details -->
299
+
300
+ </div><!-- 5Buntitled-5D -->
301
+
302
+
303
+ </div><!-- documentation -->
304
+
305
+ <div id="validator-badges">
306
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
307
+ <p><small>Generated with the <a href="http://deveiate.org/projects/Darkfish-Rdoc/">Darkfish
308
+ Rdoc Generator</a> 2</small>.</p>
309
+ </div>
310
+
311
+ </body>
312
+ </html>
313
+
@@ -0,0 +1,390 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
3
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
4
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
5
+ <head>
6
+ <meta content="text/html; charset=UTF-8" http-equiv="Content-Type" />
7
+
8
+ <title>Class: Ankusa::TextHash</title>
9
+
10
+ <link rel="stylesheet" href="../rdoc.css" type="text/css" media="screen" />
11
+
12
+ <script src="../js/jquery.js" type="text/javascript" charset="utf-8"></script>
13
+ <script src="../js/thickbox-compressed.js" type="text/javascript" charset="utf-8"></script>
14
+ <script src="../js/quicksearch.js" type="text/javascript" charset="utf-8"></script>
15
+ <script src="../js/darkfish.js" type="text/javascript" charset="utf-8"></script>
16
+
17
+ </head>
18
+ <body id="top" class="class">
19
+
20
+ <div id="metadata">
21
+ <div id="home-metadata">
22
+ <div id="home-section" class="section">
23
+ <h3 class="section-header">
24
+ <a href="../index.html">Home</a>
25
+ <a href="../index.html#classes">Classes</a>
26
+ <a href="../index.html#methods">Methods</a>
27
+ </h3>
28
+ </div>
29
+ </div>
30
+
31
+ <div id="file-metadata">
32
+ <div id="file-list-section" class="section">
33
+ <h3 class="section-header">In Files</h3>
34
+ <div class="section-body">
35
+ <ul>
36
+
37
+ <li><a href="../lib/ankusa/hasher_rb.html?TB_iframe=true&amp;height=550&amp;width=785"
38
+ class="thickbox" title="lib/ankusa/hasher.rb">lib/ankusa/hasher.rb</a></li>
39
+
40
+ </ul>
41
+ </div>
42
+ </div>
43
+
44
+
45
+ </div>
46
+
47
+ <div id="class-metadata">
48
+
49
+ <!-- Parent Class -->
50
+ <div id="parent-class-section" class="section">
51
+ <h3 class="section-header">Parent</h3>
52
+
53
+ <p class="link">Hash</p>
54
+
55
+ </div>
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+ <!-- Method Quickref -->
64
+ <div id="method-list-section" class="section">
65
+ <h3 class="section-header">Methods</h3>
66
+ <ul class="link-list">
67
+
68
+ <li><a href="#method-c-atomize">::atomize</a></li>
69
+
70
+ <li><a href="#method-c-new">::new</a></li>
71
+
72
+ <li><a href="#method-c-valid_word-3F">::valid_word?</a></li>
73
+
74
+ <li><a href="#method-i-add_text">#add_text</a></li>
75
+
76
+ <li><a href="#method-i-add_word">#add_word</a></li>
77
+
78
+ </ul>
79
+ </div>
80
+
81
+
82
+
83
+ </div>
84
+
85
+ <div id="project-metadata">
86
+
87
+
88
+ <div id="fileindex-section" class="section project-section">
89
+ <h3 class="section-header">Files</h3>
90
+ <ul>
91
+
92
+ <li class="file"><a href="../README_rdoc.html">README.rdoc</a></li>
93
+
94
+ </ul>
95
+ </div>
96
+
97
+
98
+ <div id="classindex-section" class="section project-section">
99
+ <h3 class="section-header">Class/Module Index
100
+ <span class="search-toggle"><img src="../images/find.png"
101
+ height="16" width="16" alt="[+]"
102
+ title="show/hide quicksearch" /></span></h3>
103
+ <form action="#" method="get" accept-charset="utf-8" class="initially-hidden">
104
+ <fieldset>
105
+ <legend>Quicksearch</legend>
106
+ <input type="text" name="quicksearch" value=""
107
+ class="quicksearch-field" />
108
+ </fieldset>
109
+ </form>
110
+
111
+ <ul class="link-list">
112
+
113
+ <li><a href="../Ankusa.html">Ankusa</a></li>
114
+
115
+ <li><a href="../Ankusa/CassandraStorage.html">Ankusa::CassandraStorage</a></li>
116
+
117
+ <li><a href="../Ankusa/Classifier.html">Ankusa::Classifier</a></li>
118
+
119
+ <li><a href="../Ankusa/FileSystemStorage.html">Ankusa::FileSystemStorage</a></li>
120
+
121
+ <li><a href="../Ankusa/HBaseStorage.html">Ankusa::HBaseStorage</a></li>
122
+
123
+ <li><a href="../Ankusa/KLDivergenceClassifier.html">Ankusa::KLDivergenceClassifier</a></li>
124
+
125
+ <li><a href="../Ankusa/MemoryStorage.html">Ankusa::MemoryStorage</a></li>
126
+
127
+ <li><a href="../Ankusa/NaiveBayesClassifier.html">Ankusa::NaiveBayesClassifier</a></li>
128
+
129
+ <li><a href="../Ankusa/TextHash.html">Ankusa::TextHash</a></li>
130
+
131
+ <li><a href="../String.html">String</a></li>
132
+
133
+ </ul>
134
+ <div id="no-class-search-results" style="display: none;">No matching classes.</div>
135
+ </div>
136
+
137
+
138
+ </div>
139
+ </div>
140
+
141
+ <div id="documentation">
142
+ <h1 class="class">Ankusa::TextHash</h1>
143
+
144
+ <div id="description" class="description">
145
+
146
+ </div><!-- description -->
147
+
148
+
149
+ <div id="5Buntitled-5D" class="documentation-section">
150
+
151
+
152
+
153
+
154
+
155
+
156
+
157
+ <!-- Attributes -->
158
+ <div id="attribute-method-details" class="method-section section">
159
+ <h3 class="section-header">Attributes</h3>
160
+
161
+
162
+ <div id="word_count-attribute-method" class="method-detail">
163
+ <a name="word_count"></a>
164
+
165
+ <div class="method-heading attribute-method-heading">
166
+ <span class="method-name">word_count</span><span
167
+ class="attribute-access-type">[R]</span>
168
+ </div>
169
+
170
+ <div class="method-description">
171
+
172
+
173
+
174
+ </div>
175
+ </div>
176
+
177
+ </div><!-- attribute-method-details -->
178
+
179
+
180
+ <!-- Methods -->
181
+
182
+ <div id="public-class-method-details" class="method-section section">
183
+ <h3 class="section-header">Public Class Methods</h3>
184
+
185
+
186
+ <div id="atomize-method" class="method-detail ">
187
+ <a name="method-c-atomize"></a>
188
+
189
+
190
+ <div class="method-heading">
191
+ <span class="method-name">atomize</span><span
192
+ class="method-args">(text)</span>
193
+ <span class="method-click-advice">click to toggle source</span>
194
+ </div>
195
+
196
+
197
+ <div class="method-description">
198
+
199
+
200
+
201
+
202
+
203
+ <div class="method-source-code" id="atomize-source">
204
+ <pre>
205
+ <span class="ruby-comment"># File lib/ankusa/hasher.rb, line 15</span>
206
+ <span class="ruby-keyword">def</span> <span class="ruby-keyword">self</span>.<span class="ruby-identifier">atomize</span>(<span class="ruby-identifier">text</span>)
207
+ <span class="ruby-identifier">text</span>.<span class="ruby-identifier">downcase</span>.<span class="ruby-identifier">to_ascii</span>.<span class="ruby-identifier">tr</span>(<span class="ruby-string">'-'</span>, <span class="ruby-string">' '</span>).<span class="ruby-identifier">gsub</span>(<span class="ruby-regexp">/[^\w\s]/</span>,<span class="ruby-string">&quot; &quot;</span>).<span class="ruby-identifier">split</span>
208
+ <span class="ruby-keyword">end</span></pre>
209
+ </div><!-- atomize-source -->
210
+
211
+ </div>
212
+
213
+
214
+
215
+
216
+ </div><!-- atomize-method -->
217
+
218
+
219
+ <div id="new-method" class="method-detail ">
220
+ <a name="method-c-new"></a>
221
+
222
+
223
+ <div class="method-heading">
224
+ <span class="method-name">new</span><span
225
+ class="method-args">(text=nil)</span>
226
+ <span class="method-click-advice">click to toggle source</span>
227
+ </div>
228
+
229
+
230
+ <div class="method-description">
231
+
232
+
233
+
234
+
235
+
236
+ <div class="method-source-code" id="new-source">
237
+ <pre>
238
+ <span class="ruby-comment"># File lib/ankusa/hasher.rb, line 9</span>
239
+ <span class="ruby-keyword">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">text</span>=<span class="ruby-keyword">nil</span>)
240
+ <span class="ruby-keyword">super</span> <span class="ruby-value">0</span>
241
+ <span class="ruby-ivar">@word_count</span> = <span class="ruby-value">0</span>
242
+ <span class="ruby-identifier">add_text</span>(<span class="ruby-identifier">text</span>) <span class="ruby-keyword">unless</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">nil?</span>
243
+ <span class="ruby-keyword">end</span></pre>
244
+ </div><!-- new-source -->
245
+
246
+ </div>
247
+
248
+
249
+
250
+
251
+ </div><!-- new-method -->
252
+
253
+
254
+ <div id="valid_word-3F-method" class="method-detail ">
255
+ <a name="method-c-valid_word-3F"></a>
256
+
257
+
258
+ <div class="method-heading">
259
+ <span class="method-name">valid_word?</span><span
260
+ class="method-args">(word)</span>
261
+ <span class="method-click-advice">click to toggle source</span>
262
+ </div>
263
+
264
+
265
+ <div class="method-description">
266
+
267
+ <p>The word is expected to contain only alphanumeric characters at this point.</p>
268
+
269
+
270
+
271
+ <div class="method-source-code" id="valid_word-3F-source">
272
+ <pre>
273
+ <span class="ruby-comment"># File lib/ankusa/hasher.rb, line 20</span>
274
+ <span class="ruby-keyword">def</span> <span class="ruby-keyword">self</span>.<span class="ruby-identifier">valid_word?</span>(<span class="ruby-identifier">word</span>)
275
+ <span class="ruby-keyword">return</span> <span class="ruby-keyword">true</span> <span class="ruby-keyword">unless</span> <span class="ruby-constant">Ankusa</span><span class="ruby-operator">::</span><span class="ruby-constant">STOPWORDS</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">word</span> <span class="ruby-operator">||</span> <span class="ruby-identifier">word</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">&lt;</span> <span class="ruby-value">3</span> <span class="ruby-operator">||</span> <span class="ruby-identifier">word</span>.<span class="ruby-identifier">numeric?</span>
276
+ <span class="ruby-keyword">end</span></pre>
277
+ </div><!-- valid_word-3F-source -->
278
+
279
+ </div>
280
+
281
+
282
+
283
+
284
+ </div><!-- valid_word-3F-method -->
285
+
286
+
287
+ </div><!-- public-class-method-details -->
288
+
289
+ <div id="public-instance-method-details" class="method-section section">
290
+ <h3 class="section-header">Public Instance Methods</h3>
291
+
292
+
293
+ <div id="add_text-method" class="method-detail ">
294
+ <a name="method-i-add_text"></a>
295
+
296
+
297
+ <div class="method-heading">
298
+ <span class="method-name">add_text</span><span
299
+ class="method-args">(text)</span>
300
+ <span class="method-click-advice">click to toggle source</span>
301
+ </div>
302
+
303
+
304
+ <div class="method-description">
305
+
306
+
307
+
308
+
309
+
310
+ <div class="method-source-code" id="add_text-source">
311
+ <pre>
312
+ <span class="ruby-comment"># File lib/ankusa/hasher.rb, line 24</span>
313
+ <span class="ruby-keyword">def</span> <span class="ruby-identifier">add_text</span>(<span class="ruby-identifier">text</span>)
314
+ <span class="ruby-keyword">if</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">instance_of?</span> <span class="ruby-constant">Array</span>
315
+ <span class="ruby-identifier">text</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">t</span><span class="ruby-operator">|</span> <span class="ruby-identifier">add_text</span> <span class="ruby-identifier">t</span> }
316
+ <span class="ruby-keyword">else</span>
317
+ <span class="ruby-comment"># replace dashes with spaces, then get rid of non-word/non-space characters, </span>
318
+ <span class="ruby-comment"># then split by space to get words</span>
319
+ <span class="ruby-identifier">words</span> = <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">atomize</span> <span class="ruby-identifier">text</span>
320
+ <span class="ruby-identifier">words</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span><span class="ruby-operator">|</span> <span class="ruby-identifier">add_word</span>(<span class="ruby-identifier">word</span>) <span class="ruby-keyword">if</span> <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">valid_word?</span>(<span class="ruby-identifier">word</span>) }
321
+ <span class="ruby-keyword">end</span>
322
+ <span class="ruby-keyword">self</span>
323
+ <span class="ruby-keyword">end</span></pre>
324
+ </div><!-- add_text-source -->
325
+
326
+ </div>
327
+
328
+
329
+
330
+
331
+ </div><!-- add_text-method -->
332
+
333
+
334
+ </div><!-- public-instance-method-details -->
335
+
336
+ <div id="protected-instance-method-details" class="method-section section">
337
+ <h3 class="section-header">Protected Instance Methods</h3>
338
+
339
+
340
+ <div id="add_word-method" class="method-detail ">
341
+ <a name="method-i-add_word"></a>
342
+
343
+
344
+ <div class="method-heading">
345
+ <span class="method-name">add_word</span><span
346
+ class="method-args">(word)</span>
347
+ <span class="method-click-advice">click to toggle source</span>
348
+ </div>
349
+
350
+
351
+ <div class="method-description">
352
+
353
+
354
+
355
+
356
+
357
+ <div class="method-source-code" id="add_word-source">
358
+ <pre>
359
+ <span class="ruby-comment"># File lib/ankusa/hasher.rb, line 38</span>
360
+ <span class="ruby-keyword">def</span> <span class="ruby-identifier">add_word</span>(<span class="ruby-identifier">word</span>)
361
+ <span class="ruby-ivar">@word_count</span> <span class="ruby-operator">+=</span> <span class="ruby-value">1</span>
362
+ <span class="ruby-identifier">key</span> = <span class="ruby-identifier">word</span>.<span class="ruby-identifier">stem</span>.<span class="ruby-identifier">intern</span>
363
+ <span class="ruby-identifier">store</span> <span class="ruby-identifier">key</span>, <span class="ruby-identifier">fetch</span>(<span class="ruby-identifier">key</span>, <span class="ruby-value">0</span>)<span class="ruby-operator">+</span><span class="ruby-value">1</span>
364
+ <span class="ruby-keyword">end</span></pre>
365
+ </div><!-- add_word-source -->
366
+
367
+ </div>
368
+
369
+
370
+
371
+
372
+ </div><!-- add_word-method -->
373
+
374
+
375
+ </div><!-- protected-instance-method-details -->
376
+
377
+ </div><!-- 5Buntitled-5D -->
378
+
379
+
380
+ </div><!-- documentation -->
381
+
382
+ <div id="validator-badges">
383
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
384
+ <p><small>Generated with the <a href="http://deveiate.org/projects/Darkfish-Rdoc/">Darkfish
385
+ Rdoc Generator</a> 2</small>.</p>
386
+ </div>
387
+
388
+ </body>
389
+ </html>
390
+