ankusa 0.0.9 → 0.0.10

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58) hide show
  1. data/Rakefile +3 -3
  2. data/docs/Ankusa.html +229 -0
  3. data/docs/Ankusa/CassandraStorage.html +801 -0
  4. data/docs/Ankusa/Classifier.html +440 -0
  5. data/docs/Ankusa/FileSystemStorage.html +376 -0
  6. data/docs/Ankusa/HBaseStorage.html +845 -0
  7. data/docs/Ankusa/KLDivergenceClassifier.html +265 -0
  8. data/docs/Ankusa/MemoryStorage.html +672 -0
  9. data/docs/Ankusa/NaiveBayesClassifier.html +313 -0
  10. data/docs/Ankusa/TextHash.html +390 -0
  11. data/docs/README_rdoc.html +268 -0
  12. data/docs/String.html +241 -0
  13. data/docs/created.rid +14 -0
  14. data/docs/images/brick.png +0 -0
  15. data/docs/images/brick_link.png +0 -0
  16. data/docs/images/bug.png +0 -0
  17. data/docs/images/bullet_black.png +0 -0
  18. data/docs/images/bullet_toggle_minus.png +0 -0
  19. data/docs/images/bullet_toggle_plus.png +0 -0
  20. data/docs/images/date.png +0 -0
  21. data/docs/images/find.png +0 -0
  22. data/docs/images/loadingAnimation.gif +0 -0
  23. data/docs/images/macFFBgHack.png +0 -0
  24. data/docs/images/package.png +0 -0
  25. data/docs/images/page_green.png +0 -0
  26. data/docs/images/page_white_text.png +0 -0
  27. data/docs/images/page_white_width.png +0 -0
  28. data/docs/images/plugin.png +0 -0
  29. data/docs/images/ruby.png +0 -0
  30. data/docs/images/tag_green.png +0 -0
  31. data/docs/images/wrench.png +0 -0
  32. data/docs/images/wrench_orange.png +0 -0
  33. data/docs/images/zoom.png +0 -0
  34. data/docs/index.html +212 -0
  35. data/docs/js/darkfish.js +116 -0
  36. data/docs/js/jquery.js +32 -0
  37. data/docs/js/quicksearch.js +114 -0
  38. data/docs/js/thickbox-compressed.js +10 -0
  39. data/docs/lib/ankusa/cassandra_storage_rb.html +54 -0
  40. data/docs/lib/ankusa/classifier_rb.html +52 -0
  41. data/docs/lib/ankusa/extensions_rb.html +54 -0
  42. data/docs/lib/ankusa/file_system_storage_rb.html +54 -0
  43. data/docs/lib/ankusa/hasher_rb.html +56 -0
  44. data/docs/lib/ankusa/hbase_storage_rb.html +54 -0
  45. data/docs/lib/ankusa/kl_divergence_rb.html +52 -0
  46. data/docs/lib/ankusa/memory_storage_rb.html +52 -0
  47. data/docs/lib/ankusa/naive_bayes_rb.html +52 -0
  48. data/docs/lib/ankusa/stopwords_rb.html +52 -0
  49. data/docs/lib/ankusa/version_rb.html +52 -0
  50. data/docs/lib/ankusa_rb.html +64 -0
  51. data/docs/rdoc.css +759 -0
  52. data/lib/ankusa/cassandra_storage.rb +2 -2
  53. data/lib/ankusa/classifier.rb +2 -2
  54. data/lib/ankusa/hasher.rb +17 -17
  55. data/lib/ankusa/hbase_storage.rb +2 -2
  56. data/lib/ankusa/stopwords.rb +1 -1
  57. data/lib/ankusa/version.rb +1 -1
  58. metadata +56 -8
@@ -0,0 +1,268 @@
1
+ <?xml version="1.0" encoding="utf-8"?>
2
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
3
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
4
+
5
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
6
+ <head>
7
+ <meta content="text/html; charset=UTF-8" http-equiv="Content-Type" />
8
+
9
+ <title>File: README.rdoc [Ankusa - Naive Bayes classifier with big data storage]</title>
10
+
11
+ <link type="text/css" media="screen" href="./rdoc.css" rel="stylesheet" />
12
+
13
+ <script src="./js/jquery.js" type="text/javascript"
14
+ charset="utf-8"></script>
15
+ <script src="./js/thickbox-compressed.js" type="text/javascript"
16
+ charset="utf-8"></script>
17
+ <script src="./js/quicksearch.js" type="text/javascript"
18
+ charset="utf-8"></script>
19
+ <script src="./js/darkfish.js" type="text/javascript"
20
+ charset="utf-8"></script>
21
+ </head>
22
+
23
+ <body class="file">
24
+ <div id="metadata">
25
+ <div id="home-metadata">
26
+ <div id="home-section" class="section">
27
+ <h3 class="section-header">
28
+ <a href="./index.html">Home</a>
29
+ <a href="./index.html#classes">Classes</a>
30
+ <a href="./index.html#methods">Methods</a>
31
+ </h3>
32
+ </div>
33
+ </div>
34
+
35
+ <div id="project-metadata">
36
+
37
+
38
+ <div id="fileindex-section" class="section project-section">
39
+ <h3 class="section-header">Files</h3>
40
+ <ul>
41
+
42
+ <li class="file"><a href="./README_rdoc.html">README.rdoc</a></li>
43
+
44
+ </ul>
45
+ </div>
46
+
47
+
48
+ <div id="classindex-section" class="section project-section">
49
+ <h3 class="section-header">Class Index
50
+ <span class="search-toggle"><img src="./images/find.png"
51
+ height="16" width="16" alt="[+]"
52
+ title="show/hide quicksearch" /></span></h3>
53
+ <form action="#" method="get" accept-charset="utf-8" class="initially-hidden">
54
+ <fieldset>
55
+ <legend>Quicksearch</legend>
56
+ <input type="text" name="quicksearch" value=""
57
+ class="quicksearch-field" />
58
+ </fieldset>
59
+ </form>
60
+
61
+ <ul class="link-list">
62
+
63
+ <li><a href="./Ankusa.html">Ankusa</a></li>
64
+
65
+ <li><a href="./Ankusa/CassandraStorage.html">Ankusa::CassandraStorage</a></li>
66
+
67
+ <li><a href="./Ankusa/Classifier.html">Ankusa::Classifier</a></li>
68
+
69
+ <li><a href="./Ankusa/FileSystemStorage.html">Ankusa::FileSystemStorage</a></li>
70
+
71
+ <li><a href="./Ankusa/HBaseStorage.html">Ankusa::HBaseStorage</a></li>
72
+
73
+ <li><a href="./Ankusa/KLDivergenceClassifier.html">Ankusa::KLDivergenceClassifier</a></li>
74
+
75
+ <li><a href="./Ankusa/MemoryStorage.html">Ankusa::MemoryStorage</a></li>
76
+
77
+ <li><a href="./Ankusa/NaiveBayesClassifier.html">Ankusa::NaiveBayesClassifier</a></li>
78
+
79
+ <li><a href="./Ankusa/TextHash.html">Ankusa::TextHash</a></li>
80
+
81
+ <li><a href="./String.html">String</a></li>
82
+
83
+ </ul>
84
+ <div id="no-class-search-results" style="display: none;">No matching classes.</div>
85
+ </div>
86
+
87
+
88
+ </div>
89
+ </div>
90
+
91
+ <div id="documentation">
92
+
93
+ <h1>ankusa</h1>
94
+
95
+ <p><a href="Ankusa.html">Ankusa</a> is a text classifier in Ruby that can use
96
+ either Hadoop’s HBase or Cassandra for storage. Because it uses HBase or
97
+ Cassandra as a backend, the training corpus can be many terabytes in size
98
+ (though additional memory and single file storage abilities also exist for
99
+ smaller corpora).</p>
100
+
101
+ <p><a href="Ankusa.html">Ankusa</a> currently provides both a Naive Bayes and
102
+ Kullback-Leibler divergence classifier. It ignores common words (a.k.a.
103
+ stop words) and stems all others. Additionally, it uses Laplacian
104
+ smoothing in both classification methods.</p>
105
+
106
+ <h2>Installation</h2>
107
+
108
+ <p>First, install HBase/Hadoop or Cassandra (&gt;= 0.7.0-rc2). Then, install
109
+ the appropriate gem:</p>
110
+
111
+ <pre>gem install hbaserb
112
+ # or
113
+ gem install cassandra</pre>
114
+
115
+ <p>If you’re using HBase, make sure the HBase Thrift interface has been
116
+ started as well. Then:</p>
117
+
118
+ <pre>gem install ankusa</pre>
119
+
120
+ <h2>Basic Usage</h2>
121
+
122
+ <p>Using the naive Bayes classifier:</p>
123
+
124
+ <pre>require 'rubygems'
125
+ require 'ankusa'
126
+ require 'ankusa/hbase_storage'
127
+
128
+ # connect to HBase. Alternatively, just for this test, use in memory storage with
129
+ # storage = Ankusa::MemoryStorage.new
130
+ storage = Ankusa::HBaseStorage.new 'localhost'
131
+ c = Ankusa::NaiveBayesClassifier.new storage
132
+
133
+ # Each of these calls will return a bag-of-words
134
+ # hash with stemmed words as keys and counts as values
135
+ c.train :spam, &quot;This is some spammy text&quot;
136
+ c.train :good, &quot;This is not the bad stuff&quot;
137
+
138
+ # This will return the most likely class (as symbol)
139
+ puts c.classify &quot;This is some spammy text&quot;
140
+
141
+ # This will return Hash with classes as keys and
142
+ # membership probability as values
143
+ puts c.classifications &quot;This is some spammy text&quot;
144
+
145
+ # If you have a large corpus, the probabilities will
146
+ # likely all be 0. In that case, you must use log
147
+ # likelihood values
148
+ puts c.log_likelihoods &quot;This is some spammy text&quot;
149
+
150
+ # get a list of all classes
151
+ puts c.classnames
152
+
153
+ # close connection
154
+ storage.close</pre>
155
+
156
+ <h2>KL Divergence Classifier</h2>
157
+
158
+ <p>There is a Kullback–Leibler divergence classifier as well. KL divergence
159
+ is a distance measure (though not a true metric because it does not satisfy
160
+ the triangle inequality). The KL classifier simply measures the relative
161
+ entropy between the text you want to classify and each of the classes. The
162
+ class with the shortest “distance” is the best class. You may find
163
+ that for an especially large corpus it may be slightly faster to use this
164
+ classifier (since prior probabilities are never calculated, only
165
+ likelihoods).</p>
166
+
167
+ <p>The API is the same as the NaiveBayesClassifier, except rather than calling
168
+ “classifications” if you want actual numbers you call “distances”.</p>
169
+
170
+ <pre>require 'rubygems'
171
+ require 'ankusa'
172
+ require 'ankusa/hbase_storage'
173
+
174
+ # connect to HBase
175
+ storage = Ankusa::HBaseStorage.new 'localhost'
176
+ c = Ankusa::KLDivergenceClassifier.new storage
177
+
178
+ # Each of these calls will return a bag-of-words
179
+ # hash with stemmed words as keys and counts as values
180
+ c.train :spam, &quot;This is some spammy text&quot;
181
+ c.train :good, &quot;This is not the bad stuff&quot;
182
+
183
+ # This will return the most likely class (as symbol)
184
+ puts c.classify &quot;This is some spammy text&quot;
185
+
186
+ # This will return Hash with classes as keys and
187
+ # distances &gt;= 0 as values
188
+ puts c.distances &quot;This is some spammy text&quot;
189
+
190
+ # get a list of all classes
191
+ puts c.classnames
192
+
193
+ # close connection
194
+ storage.close</pre>
195
+
196
+ <h2>Storage Methods</h2>
197
+
198
+ <p><a href="Ankusa.html">Ankusa</a> has a generalized storage interface that
199
+ has been implemented for HBase, Cassandra, single file, and in-memory
200
+ storage.</p>
201
+
202
+ <p>Memory storage can be used when you have a very small corpus.</p>
203
+
204
+ <pre>require 'ankusa/memory_storage'
205
+ storage = Ankusa::MemoryStorage.new</pre>
206
+
207
+ <p>FileSystem storage can be used when you have a very small corpus and want
208
+ to persist the classification results.</p>
209
+
210
+ <pre>require 'ankusa/file_system_storage'
211
+ storage = Ankusa::FileSystemStorage.new '/path/to/file'
212
+ # Do classification ...
213
+ storage.save</pre>
214
+
215
+ <p>The FileSystem storage does NOT save to the filesystem automatically; the
216
+ #save method must be invoked to save and persist the results.</p>
217
+
218
+ <p>HBase storage:</p>
219
+
220
+ <pre>require 'ankusa/hbase_storage'
221
+ # defaults: host='localhost', port=9090, frequency_tablename=&quot;ankusa_word_frequencies&quot;, summary_tablename=&quot;ankusa_summary&quot;
222
+ storage = Ankusa::HBaseStorage.new host, port, frequency_tablename, summary_tablename</pre>
223
+
224
+ <p>For Cassandra storage:</p>
225
+ <ul><li>
226
+ <p>You will need Cassandra version 0.7.0-rc2 or greater.</p>
227
+ </li><li>
228
+ <p>You will need to set a max number of classes since the current implementation of
229
+ the Ruby Cassandra client doesn’t support table scans.</p>
230
+ </li><li>
231
+ <p>Prior to using the Cassandra storage you will need to run the following
232
+ command from the cassandra-cli: “create keyspace ankusa with
233
+ replication_factor = 1”. This should be fixed with a new release
234
+ candidate for Cassandra.</p>
235
+ </li></ul>
236
+
237
+ <p>To use the Cassandra storage class:</p>
238
+
239
+ <pre>require 'ankusa/cassandra_storage'
240
+ # defaults: host='127.0.0.1', port=9160, keyspace = 'ankusa', max_classes = 100
241
+ storage = Ankusa::CassandraStorage.new host, port, keyspace, max_classes</pre>
242
+
243
+ <h2>Running Tests</h2>
244
+
245
+ <p>You can run the tests for any of the four storage methods. For instance,
246
+ for memory storage:</p>
247
+
248
+ <pre>rake test_memory</pre>
249
+
250
+ <p>For the other methods you will need to edit the file test/config.yml and
251
+ set the configuration params. Then:</p>
252
+
253
+ <pre>rake test_hbase
254
+ # or
255
+ rake test_cassandra
256
+ # or
257
+ rake test_filesystem</pre>
258
+
259
+ </div>
260
+
261
+ <div id="validator-badges">
262
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
263
+ <p><small>Generated with the <a href="http://deveiate.org/projects/Darkfish-Rdoc/">Darkfish
264
+ Rdoc Generator</a> 2</small>.</p>
265
+ </div>
266
+ </body>
267
+ </html>
268
+
data/docs/String.html ADDED
@@ -0,0 +1,241 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
3
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
4
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
5
+ <head>
6
+ <meta content="text/html; charset=UTF-8" http-equiv="Content-Type" />
7
+
8
+ <title>Class: String</title>
9
+
10
+ <link rel="stylesheet" href="./rdoc.css" type="text/css" media="screen" />
11
+
12
+ <script src="./js/jquery.js" type="text/javascript" charset="utf-8"></script>
13
+ <script src="./js/thickbox-compressed.js" type="text/javascript" charset="utf-8"></script>
14
+ <script src="./js/quicksearch.js" type="text/javascript" charset="utf-8"></script>
15
+ <script src="./js/darkfish.js" type="text/javascript" charset="utf-8"></script>
16
+
17
+ </head>
18
+ <body id="top" class="class">
19
+
20
+ <div id="metadata">
21
+ <div id="home-metadata">
22
+ <div id="home-section" class="section">
23
+ <h3 class="section-header">
24
+ <a href="./index.html">Home</a>
25
+ <a href="./index.html#classes">Classes</a>
26
+ <a href="./index.html#methods">Methods</a>
27
+ </h3>
28
+ </div>
29
+ </div>
30
+
31
+ <div id="file-metadata">
32
+ <div id="file-list-section" class="section">
33
+ <h3 class="section-header">In Files</h3>
34
+ <div class="section-body">
35
+ <ul>
36
+
37
+ <li><a href="./lib/ankusa/extensions_rb.html?TB_iframe=true&amp;height=550&amp;width=785"
38
+ class="thickbox" title="lib/ankusa/extensions.rb">lib/ankusa/extensions.rb</a></li>
39
+
40
+ </ul>
41
+ </div>
42
+ </div>
43
+
44
+
45
+ </div>
46
+
47
+ <div id="class-metadata">
48
+
49
+ <!-- Parent Class -->
50
+ <div id="parent-class-section" class="section">
51
+ <h3 class="section-header">Parent</h3>
52
+
53
+ <p class="link">Object</p>
54
+
55
+ </div>
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+ <!-- Method Quickref -->
64
+ <div id="method-list-section" class="section">
65
+ <h3 class="section-header">Methods</h3>
66
+ <ul class="link-list">
67
+
68
+ <li><a href="#method-i-numeric-3F">#numeric?</a></li>
69
+
70
+ <li><a href="#method-i-to_ascii">#to_ascii</a></li>
71
+
72
+ </ul>
73
+ </div>
74
+
75
+
76
+
77
+ </div>
78
+
79
+ <div id="project-metadata">
80
+
81
+
82
+ <div id="fileindex-section" class="section project-section">
83
+ <h3 class="section-header">Files</h3>
84
+ <ul>
85
+
86
+ <li class="file"><a href="./README_rdoc.html">README.rdoc</a></li>
87
+
88
+ </ul>
89
+ </div>
90
+
91
+
92
+ <div id="classindex-section" class="section project-section">
93
+ <h3 class="section-header">Class/Module Index
94
+ <span class="search-toggle"><img src="./images/find.png"
95
+ height="16" width="16" alt="[+]"
96
+ title="show/hide quicksearch" /></span></h3>
97
+ <form action="#" method="get" accept-charset="utf-8" class="initially-hidden">
98
+ <fieldset>
99
+ <legend>Quicksearch</legend>
100
+ <input type="text" name="quicksearch" value=""
101
+ class="quicksearch-field" />
102
+ </fieldset>
103
+ </form>
104
+
105
+ <ul class="link-list">
106
+
107
+ <li><a href="./Ankusa.html">Ankusa</a></li>
108
+
109
+ <li><a href="./Ankusa/CassandraStorage.html">Ankusa::CassandraStorage</a></li>
110
+
111
+ <li><a href="./Ankusa/Classifier.html">Ankusa::Classifier</a></li>
112
+
113
+ <li><a href="./Ankusa/FileSystemStorage.html">Ankusa::FileSystemStorage</a></li>
114
+
115
+ <li><a href="./Ankusa/HBaseStorage.html">Ankusa::HBaseStorage</a></li>
116
+
117
+ <li><a href="./Ankusa/KLDivergenceClassifier.html">Ankusa::KLDivergenceClassifier</a></li>
118
+
119
+ <li><a href="./Ankusa/MemoryStorage.html">Ankusa::MemoryStorage</a></li>
120
+
121
+ <li><a href="./Ankusa/NaiveBayesClassifier.html">Ankusa::NaiveBayesClassifier</a></li>
122
+
123
+ <li><a href="./Ankusa/TextHash.html">Ankusa::TextHash</a></li>
124
+
125
+ <li><a href="./String.html">String</a></li>
126
+
127
+ </ul>
128
+ <div id="no-class-search-results" style="display: none;">No matching classes.</div>
129
+ </div>
130
+
131
+
132
+ </div>
133
+ </div>
134
+
135
+ <div id="documentation">
136
+ <h1 class="class">String</h1>
137
+
138
+ <div id="description" class="description">
139
+
140
+ </div><!-- description -->
141
+
142
+
143
+ <div id="5Buntitled-5D" class="documentation-section">
144
+
145
+
146
+
147
+
148
+
149
+
150
+
151
+
152
+ <!-- Methods -->
153
+
154
+ <div id="public-instance-method-details" class="method-section section">
155
+ <h3 class="section-header">Public Instance Methods</h3>
156
+
157
+
158
+ <div id="numeric-3F-method" class="method-detail ">
159
+ <a name="method-i-numeric-3F"></a>
160
+
161
+
162
+ <div class="method-heading">
163
+ <span class="method-name">numeric?</span><span
164
+ class="method-args">()</span>
165
+ <span class="method-click-advice">click to toggle source</span>
166
+ </div>
167
+
168
+
169
+ <div class="method-description">
170
+
171
+
172
+
173
+
174
+
175
+ <div class="method-source-code" id="numeric-3F-source">
176
+ <pre>
177
+ <span class="ruby-comment"># File lib/ankusa/extensions.rb, line 4</span>
178
+ <span class="ruby-keyword">def</span> <span class="ruby-identifier">numeric?</span>
179
+ <span class="ruby-keyword">true</span> <span class="ruby-keyword">if</span> <span class="ruby-constant">Float</span>(<span class="ruby-keyword">self</span>) <span class="ruby-keyword">rescue</span> <span class="ruby-keyword">false</span>
180
+ <span class="ruby-keyword">end</span></pre>
181
+ </div><!-- numeric-3F-source -->
182
+
183
+ </div>
184
+
185
+
186
+
187
+
188
+ </div><!-- numeric-3F-method -->
189
+
190
+
191
+ <div id="to_ascii-method" class="method-detail ">
192
+ <a name="method-i-to_ascii"></a>
193
+
194
+
195
+ <div class="method-heading">
196
+ <span class="method-name">to_ascii</span><span
197
+ class="method-args">()</span>
198
+ <span class="method-click-advice">click to toggle source</span>
199
+ </div>
200
+
201
+
202
+ <div class="method-description">
203
+
204
+
205
+
206
+
207
+
208
+ <div class="method-source-code" id="to_ascii-source">
209
+ <pre>
210
+ <span class="ruby-comment"># File lib/ankusa/extensions.rb, line 8</span>
211
+ <span class="ruby-keyword">def</span> <span class="ruby-identifier">to_ascii</span>
212
+ <span class="ruby-comment"># from http://www.jroller.com/obie/tags/unicode</span>
213
+ <span class="ruby-identifier">converter</span> = <span class="ruby-constant">Iconv</span>.<span class="ruby-identifier">new</span>(<span class="ruby-string">'ASCII//IGNORE//TRANSLIT'</span>, <span class="ruby-string">'UTF-8'</span>)
214
+ <span class="ruby-identifier">converter</span>.<span class="ruby-identifier">iconv</span>(<span class="ruby-keyword">self</span>).<span class="ruby-identifier">unpack</span>(<span class="ruby-string">'U*'</span>).<span class="ruby-identifier">select</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cp</span><span class="ruby-operator">|</span> <span class="ruby-identifier">cp</span> <span class="ruby-operator">&lt;</span> <span class="ruby-value">127</span> }.<span class="ruby-identifier">pack</span>(<span class="ruby-string">'U*'</span>) <span class="ruby-keyword">rescue</span> <span class="ruby-string">&quot;&quot;</span>
215
+ <span class="ruby-keyword">end</span></pre>
216
+ </div><!-- to_ascii-source -->
217
+
218
+ </div>
219
+
220
+
221
+
222
+
223
+ </div><!-- to_ascii-method -->
224
+
225
+
226
+ </div><!-- public-instance-method-details -->
227
+
228
+ </div><!-- 5Buntitled-5D -->
229
+
230
+
231
+ </div><!-- documentation -->
232
+
233
+ <div id="validator-badges">
234
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
235
+ <p><small>Generated with the <a href="http://deveiate.org/projects/Darkfish-Rdoc/">Darkfish
236
+ Rdoc Generator</a> 2</small>.</p>
237
+ </div>
238
+
239
+ </body>
240
+ </html>
241
+