ankusa 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. data/README.rdoc +2 -1
  2. data/Rakefile +4 -26
  3. data/lib/ankusa.rb +1 -0
  4. data/lib/ankusa/classifier.rb +3 -0
  5. data/lib/ankusa/naive_bayes.rb +8 -4
  6. data/lib/ankusa/version.rb +3 -0
  7. metadata +6 -33
  8. data/docs/classes/Ankusa.html +0 -182
  9. data/docs/classes/Ankusa/CassandraStorage.html +0 -615
  10. data/docs/classes/Ankusa/Classifier.html +0 -315
  11. data/docs/classes/Ankusa/FileSystemStorage.html +0 -272
  12. data/docs/classes/Ankusa/HBaseStorage.html +0 -594
  13. data/docs/classes/Ankusa/KLDivergenceClassifier.html +0 -194
  14. data/docs/classes/Ankusa/MemoryStorage.html +0 -467
  15. data/docs/classes/Ankusa/NaiveBayesClassifier.html +0 -231
  16. data/docs/classes/Ankusa/TextHash.html +0 -275
  17. data/docs/classes/String.html +0 -172
  18. data/docs/created.rid +0 -1
  19. data/docs/files/README_rdoc.html +0 -294
  20. data/docs/files/lib/ankusa/cassandra_storage_rb.html +0 -108
  21. data/docs/files/lib/ankusa/classifier_rb.html +0 -101
  22. data/docs/files/lib/ankusa/extensions_rb.html +0 -108
  23. data/docs/files/lib/ankusa/file_system_storage_rb.html +0 -108
  24. data/docs/files/lib/ankusa/hasher_rb.html +0 -109
  25. data/docs/files/lib/ankusa/hbase_storage_rb.html +0 -108
  26. data/docs/files/lib/ankusa/kl_divergence_rb.html +0 -101
  27. data/docs/files/lib/ankusa/memory_storage_rb.html +0 -101
  28. data/docs/files/lib/ankusa/naive_bayes_rb.html +0 -101
  29. data/docs/files/lib/ankusa/stopwords_rb.html +0 -101
  30. data/docs/files/lib/ankusa_rb.html +0 -112
  31. data/docs/fr_class_index.html +0 -36
  32. data/docs/fr_file_index.html +0 -38
  33. data/docs/fr_method_index.html +0 -95
  34. data/docs/index.html +0 -24
  35. data/docs/rdoc-style.css +0 -208
@@ -1,172 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
- <head>
8
- <title>Class: String</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
- <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
12
- <script type="text/javascript">
13
- // <![CDATA[
14
-
15
- function popupCode( url ) {
16
- window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
- }
18
-
19
- function toggleCode( id ) {
20
- if ( document.getElementById )
21
- elem = document.getElementById( id );
22
- else if ( document.all )
23
- elem = eval( "document.all." + id );
24
- else
25
- return false;
26
-
27
- elemStyle = elem.style;
28
-
29
- if ( elemStyle.display != "block" ) {
30
- elemStyle.display = "block"
31
- } else {
32
- elemStyle.display = "none"
33
- }
34
-
35
- return true;
36
- }
37
-
38
- // Make codeblocks hidden by default
39
- document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
-
41
- // ]]>
42
- </script>
43
-
44
- </head>
45
- <body>
46
-
47
-
48
-
49
- <div id="classHeader">
50
- <table class="header-table">
51
- <tr class="top-aligned-row">
52
- <td><strong>Class</strong></td>
53
- <td class="class-name-in-header">String</td>
54
- </tr>
55
- <tr class="top-aligned-row">
56
- <td><strong>In:</strong></td>
57
- <td>
58
- <a href="../files/lib/ankusa/extensions_rb.html">
59
- lib/ankusa/extensions.rb
60
- </a>
61
- <br />
62
- </td>
63
- </tr>
64
-
65
- <tr class="top-aligned-row">
66
- <td><strong>Parent:</strong></td>
67
- <td>
68
- Object
69
- </td>
70
- </tr>
71
- </table>
72
- </div>
73
- <!-- banner header -->
74
-
75
- <div id="bodyContent">
76
-
77
-
78
-
79
- <div id="contextContent">
80
-
81
-
82
-
83
- </div>
84
-
85
- <div id="method-list">
86
- <h3 class="section-bar">Methods</h3>
87
-
88
- <div class="name-list">
89
- <a href="#M000001">numeric?</a>&nbsp;&nbsp;
90
- <a href="#M000002">to_ascii</a>&nbsp;&nbsp;
91
- </div>
92
- </div>
93
-
94
- </div>
95
-
96
-
97
- <!-- if includes -->
98
-
99
- <div id="section">
100
-
101
-
102
-
103
-
104
-
105
-
106
-
107
-
108
- <!-- if method_list -->
109
- <div id="methods">
110
- <h3 class="section-bar">Public Instance methods</h3>
111
-
112
- <div id="method-M000001" class="method-detail">
113
- <a name="M000001"></a>
114
-
115
- <div class="method-heading">
116
- <a href="#M000001" class="method-signature">
117
- <span class="method-name">numeric?</span><span class="method-args">()</span>
118
- </a>
119
- </div>
120
-
121
- <div class="method-description">
122
- <p><a class="source-toggle" href="#"
123
- onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
124
- <div class="method-source-code" id="M000001-source">
125
- <pre>
126
- <span class="ruby-comment cmt"># File lib/ankusa/extensions.rb, line 4</span>
127
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">numeric?</span>
128
- <span class="ruby-keyword kw">true</span> <span class="ruby-keyword kw">if</span> <span class="ruby-constant">Float</span>(<span class="ruby-keyword kw">self</span>) <span class="ruby-keyword kw">rescue</span> <span class="ruby-keyword kw">false</span>
129
- <span class="ruby-keyword kw">end</span>
130
- </pre>
131
- </div>
132
- </div>
133
- </div>
134
-
135
- <div id="method-M000002" class="method-detail">
136
- <a name="M000002"></a>
137
-
138
- <div class="method-heading">
139
- <a href="#M000002" class="method-signature">
140
- <span class="method-name">to_ascii</span><span class="method-args">()</span>
141
- </a>
142
- </div>
143
-
144
- <div class="method-description">
145
- <p><a class="source-toggle" href="#"
146
- onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
147
- <div class="method-source-code" id="M000002-source">
148
- <pre>
149
- <span class="ruby-comment cmt"># File lib/ankusa/extensions.rb, line 8</span>
150
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">to_ascii</span>
151
- <span class="ruby-comment cmt"># from http://www.jroller.com/obie/tags/unicode</span>
152
- <span class="ruby-identifier">converter</span> = <span class="ruby-constant">Iconv</span>.<span class="ruby-identifier">new</span>(<span class="ruby-value str">'ASCII//IGNORE//TRANSLIT'</span>, <span class="ruby-value str">'UTF-8'</span>)
153
- <span class="ruby-identifier">converter</span>.<span class="ruby-identifier">iconv</span>(<span class="ruby-keyword kw">self</span>).<span class="ruby-identifier">unpack</span>(<span class="ruby-value str">'U*'</span>).<span class="ruby-identifier">select</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cp</span><span class="ruby-operator">|</span> <span class="ruby-identifier">cp</span> <span class="ruby-operator">&lt;</span> <span class="ruby-value">127</span> }.<span class="ruby-identifier">pack</span>(<span class="ruby-value str">'U*'</span>) <span class="ruby-keyword kw">rescue</span> <span class="ruby-value str">&quot;&quot;</span>
154
- <span class="ruby-keyword kw">end</span>
155
- </pre>
156
- </div>
157
- </div>
158
- </div>
159
-
160
-
161
- </div>
162
-
163
-
164
- </div>
165
-
166
-
167
- <div id="validator-badges">
168
- <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
169
- </div>
170
-
171
- </body>
172
- </html>
@@ -1 +0,0 @@
1
- Wed, 05 Jan 2011 17:44:50 -0500
@@ -1,294 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
- <head>
8
- <title>File: README.rdoc</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
- <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
12
- <script type="text/javascript">
13
- // <![CDATA[
14
-
15
- function popupCode( url ) {
16
- window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
- }
18
-
19
- function toggleCode( id ) {
20
- if ( document.getElementById )
21
- elem = document.getElementById( id );
22
- else if ( document.all )
23
- elem = eval( "document.all." + id );
24
- else
25
- return false;
26
-
27
- elemStyle = elem.style;
28
-
29
- if ( elemStyle.display != "block" ) {
30
- elemStyle.display = "block"
31
- } else {
32
- elemStyle.display = "none"
33
- }
34
-
35
- return true;
36
- }
37
-
38
- // Make codeblocks hidden by default
39
- document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
-
41
- // ]]>
42
- </script>
43
-
44
- </head>
45
- <body>
46
-
47
-
48
-
49
- <div id="fileHeader">
50
- <h1>README.rdoc</h1>
51
- <table class="header-table">
52
- <tr class="top-aligned-row">
53
- <td><strong>Path:</strong></td>
54
- <td>README.rdoc
55
- </td>
56
- </tr>
57
- <tr class="top-aligned-row">
58
- <td><strong>Last Update:</strong></td>
59
- <td>Wed Jan 05 17:43:40 -0500 2011</td>
60
- </tr>
61
- </table>
62
- </div>
63
- <!-- banner header -->
64
-
65
- <div id="bodyContent">
66
-
67
-
68
-
69
- <div id="contextContent">
70
-
71
- <div id="description">
72
- <h1>ankusa</h1>
73
- <p>
74
- <a href="../classes/Ankusa.html">Ankusa</a> is a text classifier in Ruby
75
- that can use either Hadoop&#8217;s HBase or Cassandra for storage. Because
76
- it uses HBase or Cassandra as a backend, the training corpus can be many
77
- terabytes in size (though additional memory and single file storage
78
- abilities also exist for smaller corpora).
79
- </p>
80
- <p>
81
- <a href="../classes/Ankusa.html">Ankusa</a> currently provides both a Naive
82
- Bayes and Kullback-Leibler divergence classifier. It ignores common words
83
- (a.k.a. stop words) and stems all others. Additionally, it uses Laplacian
84
- smoothing in both classification methods.
85
- </p>
86
- <h2>Installation</h2>
87
- <p>
88
- First, install HBase/Hadoop or Cassandra (&gt;= 0.7.0-rc2). Then, install
89
- the appropriate gem:
90
- </p>
91
- <pre>
92
- gem install hbaserb
93
- # or
94
- gem install cassandra
95
- </pre>
96
- <p>
97
- If you&#8217;re using HBase, make sure the HBase Thrift interface has been
98
- started as well. Then:
99
- </p>
100
- <pre>
101
- gem install ankusa
102
- </pre>
103
- <h2>Basic Usage</h2>
104
- <p>
105
- Using the naive Bayes classifier:
106
- </p>
107
- <pre>
108
- require 'rubygems'
109
- require 'ankusa'
110
- require 'ankusa/hbase_storage'
111
-
112
- # connect to HBase
113
- storage = Ankusa::HBaseStorage.new 'localhost'
114
- c = Ankusa::NaiveBayesClassifier.new storage
115
-
116
- # Each of these calls will return a bag-of-words
117
- # hash with stemmed words as keys and counts as values
118
- c.train :spam, &quot;This is some spammy text&quot;
119
- c.train :good, &quot;This is not the bad stuff&quot;
120
-
121
- # This will return the most likely class (as symbol)
122
- puts c.classify &quot;This is some spammy text&quot;
123
-
124
- # This will return Hash with classes as keys and
125
- # membership probability as values
126
- puts c.classifications &quot;This is some spammy text&quot;
127
-
128
- # If you have a large corpus, the probabilities will
129
- # likely all be 0. In that case, you must use log
130
- # likelihood values
131
- puts c.log_likelihoods &quot;This is some spammy text&quot;
132
-
133
- # get a list of all classes
134
- puts c.classnames
135
-
136
- # close connection
137
- storage.close
138
- </pre>
139
- <h2>KL Divergence Classifier</h2>
140
- <p>
141
- There is a Kullback–Leibler divergence classifier as well. KL divergence
142
- is a distance measure (though not a true metric because it does not satisfy
143
- the triangle inequality). The KL classifier simply measures the relative
144
- entropy between the text you want to classify and each of the classes. The
145
- class with the shortest &quot;distance&quot; is the best class. You may
146
- find that for an especially large corpus it may be slightly faster to use
147
- this classifier (since prior probabilities are never calculated, only
148
- likelihoods).
149
- </p>
150
- <p>
151
- The API is the same as the NaiveBayesClassifier, except rather than calling
152
- &quot;classifications&quot; if you want actual numbers you call
153
- &quot;distances&quot;.
154
- </p>
155
- <pre>
156
- require 'rubygems'
157
- require 'ankusa'
158
- require 'ankusa/hbase_storage'
159
-
160
- # connect to HBase
161
- storage = Ankusa::HBaseStorage.new 'localhost'
162
- c = Ankusa::KLDivergenceClassifier.new storage
163
-
164
- # Each of these calls will return a bag-of-words
165
- # hash with stemmed words as keys and counts as values
166
- c.train :spam, &quot;This is some spammy text&quot;
167
- c.train :good, &quot;This is not the bad stuff&quot;
168
-
169
- # This will return the most likely class (as symbol)
170
- puts c.classify &quot;This is some spammy text&quot;
171
-
172
- # This will return Hash with classes as keys and
173
- # distances &gt;= 0 as values
174
- puts c.distances &quot;This is some spammy text&quot;
175
-
176
- # get a list of all classes
177
- puts c.classnames
178
-
179
- # close connection
180
- storage.close
181
- </pre>
182
- <h2>Storage Methods</h2>
183
- <p>
184
- <a href="../classes/Ankusa.html">Ankusa</a> has a generalized storage
185
- interface that has been implemented for HBase, Cassandra, single file, and
186
- in-memory storage.
187
- </p>
188
- <p>
189
- Memory storage can be used when you have a very small corpus
190
- </p>
191
- <pre>
192
- require 'ankusa/memory_storage'
193
- storage = Ankusa::MemoryStorage.new
194
- </pre>
195
- <p>
196
- FileSystem storage can be used when you have a very small corpus and want
197
- to persist the classification results.
198
- </p>
199
- <pre>
200
- require 'ankusa/file_system_storage'
201
- storage = Ankusa::FileSystemStorage.new '/path/to/file'
202
- # Do classification ...
203
- storage.save
204
- </pre>
205
- <p>
206
- The FileSystem storage does NOT save to the filesystem automatically; the
207
- save method must be invoked to save and persist the results
208
- </p>
209
- <p>
210
- HBase storage:
211
- </p>
212
- <pre>
213
- require 'ankusa/hbase_storage'
214
- # defaults: host='localhost', port=9090, frequency_tablename=&quot;ankusa_word_frequencies&quot;, summary_tablename=&quot;ankusa_summary&quot;
215
- storage = Ankusa::HBaseStorage.new host, port, frequency_tablename, summary_tablename
216
- </pre>
217
- <p>
218
- For Cassandra storage:
219
- </p>
220
- <ul>
221
- <li>You will need Cassandra version 0.7.0-rc2 or greater.
222
-
223
- </li>
224
- <li>You will need to set a max number of classes since the current implementation of
225
- the Ruby Cassandra client doesn&#8217;t support table scans.
226
-
227
- </li>
228
- <li>Prior to using the Cassandra storage you will need to run the following
229
- command from the cassandra-cli: &quot;create keyspace ankusa with
230
- replication_factor = 1&quot;. This should be fixed with a new release
231
- candidate for Cassandra.
232
-
233
- </li>
234
- </ul>
235
- <p>
236
- To use the Cassandra storage class:
237
- </p>
238
- <pre>
239
- require 'ankusa/cassandra_storage'
240
- # defaults: host='127.0.0.1', port=9160, keyspace = 'ankusa', max_classes = 100
241
- storage = Ankusa::CassandraStorage.new host, port, keyspace, max_classes
242
- </pre>
243
- <h2>Running Tests</h2>
244
- <p>
245
- You can run the tests for any of the four storage methods. For instance,
246
- for memory storage:
247
- </p>
248
- <pre>
249
- rake test_memory
250
- </pre>
251
- <p>
252
- For the other methods you will need to edit the file test/config.yml and
253
- set the configuration params. Then:
254
- </p>
255
- <pre>
256
- rake test_hbase
257
- # or
258
- rake test_cassandra
259
- # or
260
- rake test_filesystem
261
- </pre>
262
-
263
- </div>
264
-
265
-
266
- </div>
267
-
268
-
269
- </div>
270
-
271
-
272
- <!-- if includes -->
273
-
274
- <div id="section">
275
-
276
-
277
-
278
-
279
-
280
-
281
-
282
-
283
- <!-- if method_list -->
284
-
285
-
286
- </div>
287
-
288
-
289
- <div id="validator-badges">
290
- <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
291
- </div>
292
-
293
- </body>
294
- </html>