ankusa 0.0.13 → 0.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. data/Gemfile +4 -0
  2. data/Gemfile.lock +16 -0
  3. data/README.rdoc +5 -3
  4. data/Rakefile +5 -5
  5. data/lib/ankusa/naive_bayes.rb +3 -3
  6. data/lib/ankusa/version.rb +1 -1
  7. metadata +36 -100
  8. data/docs/Ankusa.html +0 -229
  9. data/docs/Ankusa/CassandraStorage.html +0 -801
  10. data/docs/Ankusa/Classifier.html +0 -440
  11. data/docs/Ankusa/FileSystemStorage.html +0 -376
  12. data/docs/Ankusa/HBaseStorage.html +0 -845
  13. data/docs/Ankusa/KLDivergenceClassifier.html +0 -265
  14. data/docs/Ankusa/MemoryStorage.html +0 -672
  15. data/docs/Ankusa/NaiveBayesClassifier.html +0 -313
  16. data/docs/Ankusa/TextHash.html +0 -390
  17. data/docs/README_rdoc.html +0 -268
  18. data/docs/String.html +0 -241
  19. data/docs/created.rid +0 -14
  20. data/docs/images/brick.png +0 -0
  21. data/docs/images/brick_link.png +0 -0
  22. data/docs/images/bug.png +0 -0
  23. data/docs/images/bullet_black.png +0 -0
  24. data/docs/images/bullet_toggle_minus.png +0 -0
  25. data/docs/images/bullet_toggle_plus.png +0 -0
  26. data/docs/images/date.png +0 -0
  27. data/docs/images/find.png +0 -0
  28. data/docs/images/loadingAnimation.gif +0 -0
  29. data/docs/images/macFFBgHack.png +0 -0
  30. data/docs/images/package.png +0 -0
  31. data/docs/images/page_green.png +0 -0
  32. data/docs/images/page_white_text.png +0 -0
  33. data/docs/images/page_white_width.png +0 -0
  34. data/docs/images/plugin.png +0 -0
  35. data/docs/images/ruby.png +0 -0
  36. data/docs/images/tag_green.png +0 -0
  37. data/docs/images/wrench.png +0 -0
  38. data/docs/images/wrench_orange.png +0 -0
  39. data/docs/images/zoom.png +0 -0
  40. data/docs/index.html +0 -212
  41. data/docs/js/darkfish.js +0 -116
  42. data/docs/js/jquery.js +0 -32
  43. data/docs/js/quicksearch.js +0 -114
  44. data/docs/js/thickbox-compressed.js +0 -10
  45. data/docs/lib/ankusa/cassandra_storage_rb.html +0 -54
  46. data/docs/lib/ankusa/classifier_rb.html +0 -52
  47. data/docs/lib/ankusa/extensions_rb.html +0 -54
  48. data/docs/lib/ankusa/file_system_storage_rb.html +0 -54
  49. data/docs/lib/ankusa/hasher_rb.html +0 -56
  50. data/docs/lib/ankusa/hbase_storage_rb.html +0 -54
  51. data/docs/lib/ankusa/kl_divergence_rb.html +0 -52
  52. data/docs/lib/ankusa/memory_storage_rb.html +0 -52
  53. data/docs/lib/ankusa/naive_bayes_rb.html +0 -52
  54. data/docs/lib/ankusa/stopwords_rb.html +0 -52
  55. data/docs/lib/ankusa/version_rb.html +0 -52
  56. data/docs/lib/ankusa_rb.html +0 -64
  57. data/docs/rdoc.css +0 -759
@@ -1,268 +0,0 @@
1
- <?xml version="1.0" encoding="utf-8"?>
2
- <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
3
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
4
-
5
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
6
- <head>
7
- <meta content="text/html; charset=UTF-8" http-equiv="Content-Type" />
8
-
9
- <title>File: README.rdoc [Ankusa - Naive Bayes classifier with big data storage]</title>
10
-
11
- <link type="text/css" media="screen" href="./rdoc.css" rel="stylesheet" />
12
-
13
- <script src="./js/jquery.js" type="text/javascript"
14
- charset="utf-8"></script>
15
- <script src="./js/thickbox-compressed.js" type="text/javascript"
16
- charset="utf-8"></script>
17
- <script src="./js/quicksearch.js" type="text/javascript"
18
- charset="utf-8"></script>
19
- <script src="./js/darkfish.js" type="text/javascript"
20
- charset="utf-8"></script>
21
- </head>
22
-
23
- <body class="file">
24
- <div id="metadata">
25
- <div id="home-metadata">
26
- <div id="home-section" class="section">
27
- <h3 class="section-header">
28
- <a href="./index.html">Home</a>
29
- <a href="./index.html#classes">Classes</a>
30
- <a href="./index.html#methods">Methods</a>
31
- </h3>
32
- </div>
33
- </div>
34
-
35
- <div id="project-metadata">
36
-
37
-
38
- <div id="fileindex-section" class="section project-section">
39
- <h3 class="section-header">Files</h3>
40
- <ul>
41
-
42
- <li class="file"><a href="./README_rdoc.html">README.rdoc</a></li>
43
-
44
- </ul>
45
- </div>
46
-
47
-
48
- <div id="classindex-section" class="section project-section">
49
- <h3 class="section-header">Class Index
50
- <span class="search-toggle"><img src="./images/find.png"
51
- height="16" width="16" alt="[+]"
52
- title="show/hide quicksearch" /></span></h3>
53
- <form action="#" method="get" accept-charset="utf-8" class="initially-hidden">
54
- <fieldset>
55
- <legend>Quicksearch</legend>
56
- <input type="text" name="quicksearch" value=""
57
- class="quicksearch-field" />
58
- </fieldset>
59
- </form>
60
-
61
- <ul class="link-list">
62
-
63
- <li><a href="./Ankusa.html">Ankusa</a></li>
64
-
65
- <li><a href="./Ankusa/CassandraStorage.html">Ankusa::CassandraStorage</a></li>
66
-
67
- <li><a href="./Ankusa/Classifier.html">Ankusa::Classifier</a></li>
68
-
69
- <li><a href="./Ankusa/FileSystemStorage.html">Ankusa::FileSystemStorage</a></li>
70
-
71
- <li><a href="./Ankusa/HBaseStorage.html">Ankusa::HBaseStorage</a></li>
72
-
73
- <li><a href="./Ankusa/KLDivergenceClassifier.html">Ankusa::KLDivergenceClassifier</a></li>
74
-
75
- <li><a href="./Ankusa/MemoryStorage.html">Ankusa::MemoryStorage</a></li>
76
-
77
- <li><a href="./Ankusa/NaiveBayesClassifier.html">Ankusa::NaiveBayesClassifier</a></li>
78
-
79
- <li><a href="./Ankusa/TextHash.html">Ankusa::TextHash</a></li>
80
-
81
- <li><a href="./String.html">String</a></li>
82
-
83
- </ul>
84
- <div id="no-class-search-results" style="display: none;">No matching classes.</div>
85
- </div>
86
-
87
-
88
- </div>
89
- </div>
90
-
91
- <div id="documentation">
92
-
93
- <h1>ankusa</h1>
94
-
95
- <p><a href="Ankusa.html">Ankusa</a> is a text classifier in Ruby that can use
96
- either Hadoop’s HBase or Cassandra for storage. Because it uses HBase or
97
- Cassandra as a backend, the training corpus can be many terabytes in size
98
- (though additional memory and single file storage abilities also exist for
99
- smaller corpora).</p>
100
-
101
- <p><a href="Ankusa.html">Ankusa</a> currently provides both a Naive Bayes and
102
- Kullback-Leibler divergence classifier. It ignores common words (a.k.a,
103
- stop words) and stems all others. Additionally, it uses Laplacian
104
- smoothing in both classification methods.</p>
105
-
106
- <h2>Installation</h2>
107
-
108
- <p>First, install HBase/Hadoop or Cassandra (&gt;= 0.7.0-rc2). Then, install
109
- the appropriate gem:</p>
110
-
111
- <pre>gem install hbaserb
112
- # or
113
- gem install cassandra</pre>
114
-
115
- <p>If you’re using HBase, make sure the HBase Thrift interface has been
116
- started as well. Then:</p>
117
-
118
- <pre>gem install ankusa</pre>
119
-
120
- <h2>Basic Usage</h2>
121
-
122
- <p>Using the naive Bayes classifier:</p>
123
-
124
- <pre>require 'rubygems'
125
- require 'ankusa'
126
- require 'ankusa/hbase_storage'
127
-
128
- # connect to HBase. Alternatively, just for this test, use in memory storage with
129
- # storage = Ankusa::MemoryStorage.new
130
- storage = Ankusa::HBaseStorage.new 'localhost'
131
- c = Ankusa::NaiveBayesClassifier.new storage
132
-
133
- # Each of these calls will return a bag-of-words
134
- # hash with stemmed words as keys and counts as values
135
- c.train :spam, &quot;This is some spammy text&quot;
136
- c.train :good, &quot;This is not the bad stuff&quot;
137
-
138
- # This will return the most likely class (as symbol)
139
- puts c.classify &quot;This is some spammy text&quot;
140
-
141
- # This will return Hash with classes as keys and
142
- # membership probability as values
143
- puts c.classifications &quot;This is some spammy text&quot;
144
-
145
- # If you have a large corpus, the probabilities will
146
- # likely all be 0. In that case, you must use log
147
- # likelihood values
148
- puts c.log_likelihoods &quot;This is some spammy text&quot;
149
-
150
- # get a list of all classes
151
- puts c.classnames
152
-
153
- # close connection
154
- storage.close</pre>
155
-
156
- <h2>KL Divergence Classifier</h2>
157
-
158
- <p>There is a Kullback–Leibler divergence classifier as well. KL divergence
159
- is a distance measure (though not a true metric because it does not satisfy
160
- the triangle inequality). The KL classifier simply measures the relative
161
- entropy between the text you want to classify and each of the classes. The
162
- class with the shortest “distance” is the best class. You may find
163
- that for an especially large corpus it may be slightly faster to use this
164
- classifier (since prior probabilities are never calculated, only
165
- likelihoods).</p>
166
-
167
- <p>The API is the same as the NaiveBayesClassifier, except rather than calling
168
- “classifications” if you want actual numbers you call “distances”.</p>
169
-
170
- <pre>require 'rubygems'
171
- require 'ankusa'
172
- require 'ankusa/hbase_storage'
173
-
174
- # connect to HBase
175
- storage = Ankusa::HBaseStorage.new 'localhost'
176
- c = Ankusa::KLDivergenceClassifier.new storage
177
-
178
- # Each of these calls will return a bag-of-words
179
- # hash with stemmed words as keys and counts as values
180
- c.train :spam, &quot;This is some spammy text&quot;
181
- c.train :good, &quot;This is not the bad stuff&quot;
182
-
183
- # This will return the most likely class (as symbol)
184
- puts c.classify &quot;This is some spammy text&quot;
185
-
186
- # This will return Hash with classes as keys and
187
- # distances &gt;= 0 as values
188
- puts c.distances &quot;This is some spammy text&quot;
189
-
190
- # get a list of all classes
191
- puts c.classnames
192
-
193
- # close connection
194
- storage.close</pre>
195
-
196
- <h2>Storage Methods</h2>
197
-
198
- <p><a href="Ankusa.html">Ankusa</a> has a generalized storage interface that
199
- has been implemented for HBase, Cassandra, single file, and in-memory
200
- storage.</p>
201
-
202
- <p>Memory storage can be used when you have a very small corpus.</p>
203
-
204
- <pre>require 'ankusa/memory_storage'
205
- storage = Ankusa::MemoryStorage.new</pre>
206
-
207
- <p>FileSystem storage can be used when you have a very small corpus and want
208
- to persist the classification results.</p>
209
-
210
- <pre>require 'ankusa/file_system_storage'
211
- storage = Ankusa::FileSystemStorage.new '/path/to/file'
212
- # Do classification ...
213
- storage.save</pre>
214
-
215
- <p>The FileSystem storage does NOT save to the filesystem automatically; the
216
- #save method must be invoked to save and persist the results</p>
217
-
218
- <p>HBase storage:</p>
219
-
220
- <pre>require 'ankusa/hbase_storage'
221
- # defaults: host='localhost', port=9090, frequency_tablename=&quot;ankusa_word_frequencies&quot;, summary_tablename=&quot;ankusa_summary&quot;
222
- storage = Ankusa::HBaseStorage.new host, port, frequency_tablename, summary_tablename</pre>
223
-
224
- <p>For Cassandra storage:</p>
225
- <ul><li>
226
- <p>You will need Cassandra version 0.7.0-rc2 or greater.</p>
227
- </li><li>
228
- <p>You will need to set a max number of classes since the current implementation of
229
- the Ruby Cassandra client doesn’t support table scans.</p>
230
- </li><li>
231
- <p>Prior to using the Cassandra storage you will need to run the following
232
- command from the cassandra-cli: “create keyspace ankusa with
233
- replication_factor = 1”. This should be fixed with a new release
234
- candidate for Cassandra.</p>
235
- </li></ul>
236
-
237
- <p>To use the Cassandra storage class:</p>
238
-
239
- <pre>require 'ankusa/cassandra_storage'
240
- # defaults: host='127.0.0.1', port=9160, keyspace = 'ankusa', max_classes = 100
241
- storage = Ankusa::CassandraStorage.new host, port, keyspace, max_classes</pre>
242
-
243
- <h2>Running Tests</h2>
244
-
245
- <p>You can run the tests for any of the four storage methods. For instance,
246
- for memory storage:</p>
247
-
248
- <pre>rake test_memory</pre>
249
-
250
- <p>For the other methods you will need to edit the file test/config.yml and
251
- set the configuration params. Then:</p>
252
-
253
- <pre>rake test_hbase
254
- # or
255
- rake test_cassandra
256
- # or
257
- rake test_filesystem</pre>
258
-
259
- </div>
260
-
261
- <div id="validator-badges">
262
- <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
263
- <p><small>Generated with the <a href="http://deveiate.org/projects/Darkfish-Rdoc/">Darkfish
264
- Rdoc Generator</a> 2</small>.</p>
265
- </div>
266
- </body>
267
- </html>
268
-
@@ -1,241 +0,0 @@
1
- <?xml version="1.0" encoding="UTF-8"?>
2
- <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
3
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
4
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
5
- <head>
6
- <meta content="text/html; charset=UTF-8" http-equiv="Content-Type" />
7
-
8
- <title>Class: String</title>
9
-
10
- <link rel="stylesheet" href="./rdoc.css" type="text/css" media="screen" />
11
-
12
- <script src="./js/jquery.js" type="text/javascript" charset="utf-8"></script>
13
- <script src="./js/thickbox-compressed.js" type="text/javascript" charset="utf-8"></script>
14
- <script src="./js/quicksearch.js" type="text/javascript" charset="utf-8"></script>
15
- <script src="./js/darkfish.js" type="text/javascript" charset="utf-8"></script>
16
-
17
- </head>
18
- <body id="top" class="class">
19
-
20
- <div id="metadata">
21
- <div id="home-metadata">
22
- <div id="home-section" class="section">
23
- <h3 class="section-header">
24
- <a href="./index.html">Home</a>
25
- <a href="./index.html#classes">Classes</a>
26
- <a href="./index.html#methods">Methods</a>
27
- </h3>
28
- </div>
29
- </div>
30
-
31
- <div id="file-metadata">
32
- <div id="file-list-section" class="section">
33
- <h3 class="section-header">In Files</h3>
34
- <div class="section-body">
35
- <ul>
36
-
37
- <li><a href="./lib/ankusa/extensions_rb.html?TB_iframe=true&amp;height=550&amp;width=785"
38
- class="thickbox" title="lib/ankusa/extensions.rb">lib/ankusa/extensions.rb</a></li>
39
-
40
- </ul>
41
- </div>
42
- </div>
43
-
44
-
45
- </div>
46
-
47
- <div id="class-metadata">
48
-
49
- <!-- Parent Class -->
50
- <div id="parent-class-section" class="section">
51
- <h3 class="section-header">Parent</h3>
52
-
53
- <p class="link">Object</p>
54
-
55
- </div>
56
-
57
-
58
-
59
-
60
-
61
-
62
-
63
- <!-- Method Quickref -->
64
- <div id="method-list-section" class="section">
65
- <h3 class="section-header">Methods</h3>
66
- <ul class="link-list">
67
-
68
- <li><a href="#method-i-numeric-3F">#numeric?</a></li>
69
-
70
- <li><a href="#method-i-to_ascii">#to_ascii</a></li>
71
-
72
- </ul>
73
- </div>
74
-
75
-
76
-
77
- </div>
78
-
79
- <div id="project-metadata">
80
-
81
-
82
- <div id="fileindex-section" class="section project-section">
83
- <h3 class="section-header">Files</h3>
84
- <ul>
85
-
86
- <li class="file"><a href="./README_rdoc.html">README.rdoc</a></li>
87
-
88
- </ul>
89
- </div>
90
-
91
-
92
- <div id="classindex-section" class="section project-section">
93
- <h3 class="section-header">Class/Module Index
94
- <span class="search-toggle"><img src="./images/find.png"
95
- height="16" width="16" alt="[+]"
96
- title="show/hide quicksearch" /></span></h3>
97
- <form action="#" method="get" accept-charset="utf-8" class="initially-hidden">
98
- <fieldset>
99
- <legend>Quicksearch</legend>
100
- <input type="text" name="quicksearch" value=""
101
- class="quicksearch-field" />
102
- </fieldset>
103
- </form>
104
-
105
- <ul class="link-list">
106
-
107
- <li><a href="./Ankusa.html">Ankusa</a></li>
108
-
109
- <li><a href="./Ankusa/CassandraStorage.html">Ankusa::CassandraStorage</a></li>
110
-
111
- <li><a href="./Ankusa/Classifier.html">Ankusa::Classifier</a></li>
112
-
113
- <li><a href="./Ankusa/FileSystemStorage.html">Ankusa::FileSystemStorage</a></li>
114
-
115
- <li><a href="./Ankusa/HBaseStorage.html">Ankusa::HBaseStorage</a></li>
116
-
117
- <li><a href="./Ankusa/KLDivergenceClassifier.html">Ankusa::KLDivergenceClassifier</a></li>
118
-
119
- <li><a href="./Ankusa/MemoryStorage.html">Ankusa::MemoryStorage</a></li>
120
-
121
- <li><a href="./Ankusa/NaiveBayesClassifier.html">Ankusa::NaiveBayesClassifier</a></li>
122
-
123
- <li><a href="./Ankusa/TextHash.html">Ankusa::TextHash</a></li>
124
-
125
- <li><a href="./String.html">String</a></li>
126
-
127
- </ul>
128
- <div id="no-class-search-results" style="display: none;">No matching classes.</div>
129
- </div>
130
-
131
-
132
- </div>
133
- </div>
134
-
135
- <div id="documentation">
136
- <h1 class="class">String</h1>
137
-
138
- <div id="description" class="description">
139
-
140
- </div><!-- description -->
141
-
142
-
143
- <div id="5Buntitled-5D" class="documentation-section">
144
-
145
-
146
-
147
-
148
-
149
-
150
-
151
-
152
- <!-- Methods -->
153
-
154
- <div id="public-instance-method-details" class="method-section section">
155
- <h3 class="section-header">Public Instance Methods</h3>
156
-
157
-
158
- <div id="numeric-3F-method" class="method-detail ">
159
- <a name="method-i-numeric-3F"></a>
160
-
161
-
162
- <div class="method-heading">
163
- <span class="method-name">numeric?</span><span
164
- class="method-args">()</span>
165
- <span class="method-click-advice">click to toggle source</span>
166
- </div>
167
-
168
-
169
- <div class="method-description">
170
-
171
-
172
-
173
-
174
-
175
- <div class="method-source-code" id="numeric-3F-source">
176
- <pre>
177
- <span class="ruby-comment"># File lib/ankusa/extensions.rb, line 4</span>
178
- <span class="ruby-keyword">def</span> <span class="ruby-identifier">numeric?</span>
179
- <span class="ruby-keyword">true</span> <span class="ruby-keyword">if</span> <span class="ruby-constant">Float</span>(<span class="ruby-keyword">self</span>) <span class="ruby-keyword">rescue</span> <span class="ruby-keyword">false</span>
180
- <span class="ruby-keyword">end</span></pre>
181
- </div><!-- numeric-3F-source -->
182
-
183
- </div>
184
-
185
-
186
-
187
-
188
- </div><!-- numeric-3F-method -->
189
-
190
-
191
- <div id="to_ascii-method" class="method-detail ">
192
- <a name="method-i-to_ascii"></a>
193
-
194
-
195
- <div class="method-heading">
196
- <span class="method-name">to_ascii</span><span
197
- class="method-args">()</span>
198
- <span class="method-click-advice">click to toggle source</span>
199
- </div>
200
-
201
-
202
- <div class="method-description">
203
-
204
-
205
-
206
-
207
-
208
- <div class="method-source-code" id="to_ascii-source">
209
- <pre>
210
- <span class="ruby-comment"># File lib/ankusa/extensions.rb, line 8</span>
211
- <span class="ruby-keyword">def</span> <span class="ruby-identifier">to_ascii</span>
212
- <span class="ruby-comment"># from http://www.jroller.com/obie/tags/unicode</span>
213
- <span class="ruby-identifier">converter</span> = <span class="ruby-constant">Iconv</span>.<span class="ruby-identifier">new</span>(<span class="ruby-string">'ASCII//IGNORE//TRANSLIT'</span>, <span class="ruby-string">'UTF-8'</span>)
214
- <span class="ruby-identifier">converter</span>.<span class="ruby-identifier">iconv</span>(<span class="ruby-keyword">self</span>).<span class="ruby-identifier">unpack</span>(<span class="ruby-string">'U*'</span>).<span class="ruby-identifier">select</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cp</span><span class="ruby-operator">|</span> <span class="ruby-identifier">cp</span> <span class="ruby-operator">&lt;</span> <span class="ruby-value">127</span> }.<span class="ruby-identifier">pack</span>(<span class="ruby-string">'U*'</span>) <span class="ruby-keyword">rescue</span> <span class="ruby-string">&quot;&quot;</span>
215
- <span class="ruby-keyword">end</span></pre>
216
- </div><!-- to_ascii-source -->
217
-
218
- </div>
219
-
220
-
221
-
222
-
223
- </div><!-- to_ascii-method -->
224
-
225
-
226
- </div><!-- public-instance-method-details -->
227
-
228
- </div><!-- 5Buntitled-5D -->
229
-
230
-
231
- </div><!-- documentation -->
232
-
233
- <div id="validator-badges">
234
- <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
235
- <p><small>Generated with the <a href="http://deveiate.org/projects/Darkfish-Rdoc/">Darkfish
236
- Rdoc Generator</a> 2</small>.</p>
237
- </div>
238
-
239
- </body>
240
- </html>
241
-