ankusa 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +13 -3
- data/Rakefile +9 -2
- data/docs/classes/Ankusa.html +5 -0
- data/docs/classes/Ankusa/FileSystemStorage.html +272 -0
- data/docs/created.rid +1 -1
- data/docs/files/README_rdoc.html +22 -5
- data/docs/files/lib/ankusa/file_system_storage_rb.html +108 -0
- data/docs/fr_class_index.html +1 -0
- data/docs/fr_file_index.html +1 -0
- data/docs/fr_method_index.html +22 -17
- data/lib/ankusa/file_system_storage.rb +55 -0
- metadata +7 -4
data/README.rdoc
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
= ankusa
|
2
2
|
|
3
|
-
Ankusa is a text classifier in Ruby that can use either Hadoop's HBase or Cassandra for storage. Because it uses HBase or Cassandra as a backend, the training corpus can be many terabytes in size.
|
3
|
+
Ankusa is a text classifier in Ruby that can use either Hadoop's HBase or Cassandra for storage. Because it uses HBase or Cassandra as a backend, the training corpus can be many terabytes in size (though additional memory and single file storage abilities also exist for smaller corpora).
|
4
4
|
|
5
5
|
Ankusa currently provides both a Naive Bayes and Kullback-Leibler divergence classifier. It ignores common words (a.k.a. stop words) and stems all others. Additionally, it uses Laplacian smoothing in both classification methods.
|
6
6
|
|
@@ -80,12 +80,20 @@ The API is the same as the NaiveBayesClassifier, except rather than calling "cla
|
|
80
80
|
storage.close
|
81
81
|
|
82
82
|
== Storage Methods
|
83
|
-
Ankusa has a generalized storage interface that has been implemented for HBase, Cassandra, and in-memory storage.
|
83
|
+
Ankusa has a generalized storage interface that has been implemented for HBase, Cassandra, single file, and in-memory storage.
|
84
84
|
|
85
85
|
Memory storage can be used when you have a very small corpus:
|
86
86
|
require 'ankusa/memory_storage'
|
87
87
|
storage = Ankusa::MemoryStorage.new
|
88
88
|
|
89
|
+
FileSystem storage can be used when you have a very small corpus and want to persist the classification results.
|
90
|
+
require 'ankusa/file_system_storage'
|
91
|
+
storage = Ankusa::FileSystemStorage.new '/path/to/file'
|
92
|
+
# Do classification ...
|
93
|
+
storage.save
|
94
|
+
|
95
|
+
The FileSystem storage does NOT save to the filesystem automatically; the #save method must be invoked to persist the results.
|
96
|
+
|
89
97
|
HBase storage:
|
90
98
|
require 'ankusa/hbase_storage'
|
91
99
|
# defaults: host='localhost', port=9090, frequency_tablename="ankusa_word_frequencies", summary_tablename="ankusa_summary"
|
@@ -103,13 +111,15 @@ To use the Cassandra storage class:
|
|
103
111
|
|
104
112
|
|
105
113
|
== Running Tests
|
106
|
-
You can run the tests for any of the
|
114
|
+
You can run the tests for any of the four storage methods. For instance, for memory storage:
|
107
115
|
rake test_memory
|
108
116
|
|
109
117
|
For the other methods you will need to edit the file test/config.yml and set the configuration params. Then:
|
110
118
|
rake test_hbase
|
111
119
|
# or
|
112
120
|
rake test_cassandra
|
121
|
+
# or
|
122
|
+
rake test_filesystem
|
113
123
|
|
114
124
|
|
115
125
|
|
data/Rakefile
CHANGED
@@ -33,11 +33,18 @@ Rake::TestTask.new("test_cassandra") { |t|
|
|
33
33
|
t.verbose = true
|
34
34
|
}
|
35
35
|
|
36
|
+
# Rake task "test_filesystem": runs the hasher test file together with the
# classifier test file written for the FileSystem storage backend.
desc "Run all unit tests with FileSystem storage"
Rake::TestTask.new("test_filesystem") do |task|
  # Put lib/ on the load path so the test files can require the library.
  task.libs.push("lib")
  task.test_files = FileList['test/hasher_test.rb', 'test/file_system_classifier_test.rb']
  task.verbose = true
end
|
42
|
+
|
36
43
|
spec = Gem::Specification.new do |s|
|
37
44
|
s.name = "ankusa"
|
38
|
-
s.version = "0.0.
|
45
|
+
s.version = "0.0.8"
|
39
46
|
s.authors = ["Brian Muller"]
|
40
|
-
s.date = %q{
|
47
|
+
s.date = %q{2011-01-05}
|
41
48
|
s.description = "Text classifier with HBase or Cassandra storage"
|
42
49
|
s.summary = "Text classifier in Ruby that uses Hadoop's HBase or Cassandra for storage"
|
43
50
|
s.email = "brian.muller@livingsocial.com"
|
data/docs/classes/Ankusa.html
CHANGED
@@ -62,6 +62,10 @@
|
|
62
62
|
<a href="../files/lib/ankusa/classifier_rb.html">
|
63
63
|
lib/ankusa/classifier.rb
|
64
64
|
</a>
|
65
|
+
<br />
|
66
|
+
<a href="../files/lib/ankusa/file_system_storage_rb.html">
|
67
|
+
lib/ankusa/file_system_storage.rb
|
68
|
+
</a>
|
65
69
|
<br />
|
66
70
|
<a href="../files/lib/ankusa/hasher_rb.html">
|
67
71
|
lib/ankusa/hasher.rb
|
@@ -130,6 +134,7 @@ cassandra
|
|
130
134
|
|
131
135
|
Module <a href="Ankusa/Classifier.html" class="link">Ankusa::Classifier</a><br />
|
132
136
|
Class <a href="Ankusa/CassandraStorage.html" class="link">Ankusa::CassandraStorage</a><br />
|
137
|
+
Class <a href="Ankusa/FileSystemStorage.html" class="link">Ankusa::FileSystemStorage</a><br />
|
133
138
|
Class <a href="Ankusa/HBaseStorage.html" class="link">Ankusa::HBaseStorage</a><br />
|
134
139
|
Class <a href="Ankusa/KLDivergenceClassifier.html" class="link">Ankusa::KLDivergenceClassifier</a><br />
|
135
140
|
Class <a href="Ankusa/MemoryStorage.html" class="link">Ankusa::MemoryStorage</a><br />
|
@@ -0,0 +1,272 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
+
<head>
|
8
|
+
<title>Class: Ankusa::FileSystemStorage</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
+
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
12
|
+
<script type="text/javascript">
|
13
|
+
// <![CDATA[
|
14
|
+
|
15
|
+
function popupCode( url ) {
|
16
|
+
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
+
}
|
18
|
+
|
19
|
+
function toggleCode( id ) {
|
20
|
+
if ( document.getElementById )
|
21
|
+
elem = document.getElementById( id );
|
22
|
+
else if ( document.all )
|
23
|
+
elem = eval( "document.all." + id );
|
24
|
+
else
|
25
|
+
return false;
|
26
|
+
|
27
|
+
elemStyle = elem.style;
|
28
|
+
|
29
|
+
if ( elemStyle.display != "block" ) {
|
30
|
+
elemStyle.display = "block"
|
31
|
+
} else {
|
32
|
+
elemStyle.display = "none"
|
33
|
+
}
|
34
|
+
|
35
|
+
return true;
|
36
|
+
}
|
37
|
+
|
38
|
+
// Make codeblocks hidden by default
|
39
|
+
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
+
|
41
|
+
// ]]>
|
42
|
+
</script>
|
43
|
+
|
44
|
+
</head>
|
45
|
+
<body>
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
<div id="classHeader">
|
50
|
+
<table class="header-table">
|
51
|
+
<tr class="top-aligned-row">
|
52
|
+
<td><strong>Class</strong></td>
|
53
|
+
<td class="class-name-in-header">Ankusa::FileSystemStorage</td>
|
54
|
+
</tr>
|
55
|
+
<tr class="top-aligned-row">
|
56
|
+
<td><strong>In:</strong></td>
|
57
|
+
<td>
|
58
|
+
<a href="../../files/lib/ankusa/file_system_storage_rb.html">
|
59
|
+
lib/ankusa/file_system_storage.rb
|
60
|
+
</a>
|
61
|
+
<br />
|
62
|
+
</td>
|
63
|
+
</tr>
|
64
|
+
|
65
|
+
<tr class="top-aligned-row">
|
66
|
+
<td><strong>Parent:</strong></td>
|
67
|
+
<td>
|
68
|
+
<a href="MemoryStorage.html">
|
69
|
+
MemoryStorage
|
70
|
+
</a>
|
71
|
+
</td>
|
72
|
+
</tr>
|
73
|
+
</table>
|
74
|
+
</div>
|
75
|
+
<!-- banner header -->
|
76
|
+
|
77
|
+
<div id="bodyContent">
|
78
|
+
|
79
|
+
|
80
|
+
|
81
|
+
<div id="contextContent">
|
82
|
+
|
83
|
+
|
84
|
+
|
85
|
+
</div>
|
86
|
+
|
87
|
+
<div id="method-list">
|
88
|
+
<h3 class="section-bar">Methods</h3>
|
89
|
+
|
90
|
+
<div class="name-list">
|
91
|
+
<a href="#M000067">drop_tables</a>
|
92
|
+
<a href="#M000068">init_tables</a>
|
93
|
+
<a href="#M000065">new</a>
|
94
|
+
<a href="#M000066">reset</a>
|
95
|
+
<a href="#M000069">save</a>
|
96
|
+
</div>
|
97
|
+
</div>
|
98
|
+
|
99
|
+
</div>
|
100
|
+
|
101
|
+
|
102
|
+
<!-- if includes -->
|
103
|
+
|
104
|
+
<div id="section">
|
105
|
+
|
106
|
+
|
107
|
+
|
108
|
+
|
109
|
+
|
110
|
+
|
111
|
+
|
112
|
+
|
113
|
+
<!-- if method_list -->
|
114
|
+
<div id="methods">
|
115
|
+
<h3 class="section-bar">Public Class methods</h3>
|
116
|
+
|
117
|
+
<div id="method-M000065" class="method-detail">
|
118
|
+
<a name="M000065"></a>
|
119
|
+
|
120
|
+
<div class="method-heading">
|
121
|
+
<a href="#M000065" class="method-signature">
|
122
|
+
<span class="method-name">new</span><span class="method-args">(file)</span>
|
123
|
+
</a>
|
124
|
+
</div>
|
125
|
+
|
126
|
+
<div class="method-description">
|
127
|
+
<p><a class="source-toggle" href="#"
|
128
|
+
onclick="toggleCode('M000065-source');return false;">[Source]</a></p>
|
129
|
+
<div class="method-source-code" id="M000065-source">
|
130
|
+
<pre>
|
131
|
+
<span class="ruby-comment cmt"># File lib/ankusa/file_system_storage.rb, line 7</span>
|
132
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">file</span>)
|
133
|
+
<span class="ruby-ivar">@file</span> = <span class="ruby-identifier">file</span>
|
134
|
+
<span class="ruby-identifier">init_tables</span>
|
135
|
+
<span class="ruby-keyword kw">end</span>
|
136
|
+
</pre>
|
137
|
+
</div>
|
138
|
+
</div>
|
139
|
+
</div>
|
140
|
+
|
141
|
+
<h3 class="section-bar">Public Instance methods</h3>
|
142
|
+
|
143
|
+
<div id="method-M000067" class="method-detail">
|
144
|
+
<a name="M000067"></a>
|
145
|
+
|
146
|
+
<div class="method-heading">
|
147
|
+
<a href="#M000067" class="method-signature">
|
148
|
+
<span class="method-name">drop_tables</span><span class="method-args">()</span>
|
149
|
+
</a>
|
150
|
+
</div>
|
151
|
+
|
152
|
+
<div class="method-description">
|
153
|
+
<p><a class="source-toggle" href="#"
|
154
|
+
onclick="toggleCode('M000067-source');return false;">[Source]</a></p>
|
155
|
+
<div class="method-source-code" id="M000067-source">
|
156
|
+
<pre>
|
157
|
+
<span class="ruby-comment cmt"># File lib/ankusa/file_system_storage.rb, line 20</span>
|
158
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">drop_tables</span>
|
159
|
+
<span class="ruby-constant">File</span>.<span class="ruby-identifier">delete</span>(<span class="ruby-ivar">@file</span>) <span class="ruby-keyword kw">rescue</span> <span class="ruby-constant">Errno</span><span class="ruby-operator">::</span><span class="ruby-constant">ENOENT</span>
|
160
|
+
<span class="ruby-identifier">reset</span>
|
161
|
+
<span class="ruby-keyword kw">end</span>
|
162
|
+
</pre>
|
163
|
+
</div>
|
164
|
+
</div>
|
165
|
+
</div>
|
166
|
+
|
167
|
+
<div id="method-M000068" class="method-detail">
|
168
|
+
<a name="M000068"></a>
|
169
|
+
|
170
|
+
<div class="method-heading">
|
171
|
+
<a href="#M000068" class="method-signature">
|
172
|
+
<span class="method-name">init_tables</span><span class="method-args">()</span>
|
173
|
+
</a>
|
174
|
+
</div>
|
175
|
+
|
176
|
+
<div class="method-description">
|
177
|
+
<p><a class="source-toggle" href="#"
|
178
|
+
onclick="toggleCode('M000068-source');return false;">[Source]</a></p>
|
179
|
+
<div class="method-source-code" id="M000068-source">
|
180
|
+
<pre>
|
181
|
+
<span class="ruby-comment cmt"># File lib/ankusa/file_system_storage.rb, line 25</span>
|
182
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">init_tables</span>
|
183
|
+
<span class="ruby-identifier">data</span> = {}
|
184
|
+
<span class="ruby-keyword kw">begin</span>
|
185
|
+
<span class="ruby-constant">File</span>.<span class="ruby-identifier">open</span>(<span class="ruby-ivar">@file</span>) <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">f</span><span class="ruby-operator">|</span>
|
186
|
+
<span class="ruby-identifier">data</span> = <span class="ruby-constant">Marshal</span>.<span class="ruby-identifier">load</span>(<span class="ruby-identifier">f</span>)
|
187
|
+
<span class="ruby-keyword kw">end</span>
|
188
|
+
<span class="ruby-ivar">@freqs</span> = <span class="ruby-identifier">data</span>[<span class="ruby-identifier">:freqs</span>]
|
189
|
+
<span class="ruby-ivar">@total_word_counts</span> = <span class="ruby-identifier">data</span>[<span class="ruby-identifier">:total_word_counts</span>]
|
190
|
+
<span class="ruby-ivar">@total_doc_counts</span> = <span class="ruby-identifier">data</span>[<span class="ruby-identifier">:total_doc_counts</span>]
|
191
|
+
<span class="ruby-ivar">@klass_word_counts</span> = <span class="ruby-identifier">data</span>[<span class="ruby-identifier">:klass_word_counts</span>]
|
192
|
+
<span class="ruby-ivar">@klass_doc_counts</span> = <span class="ruby-identifier">data</span>[<span class="ruby-identifier">:klass_word_counts</span>]
|
193
|
+
<span class="ruby-keyword kw">rescue</span> <span class="ruby-constant">Errno</span><span class="ruby-operator">::</span><span class="ruby-constant">ENOENT</span>
|
194
|
+
<span class="ruby-identifier">reset</span>
|
195
|
+
<span class="ruby-keyword kw">end</span>
|
196
|
+
<span class="ruby-keyword kw">end</span>
|
197
|
+
</pre>
|
198
|
+
</div>
|
199
|
+
</div>
|
200
|
+
</div>
|
201
|
+
|
202
|
+
<div id="method-M000066" class="method-detail">
|
203
|
+
<a name="M000066"></a>
|
204
|
+
|
205
|
+
<div class="method-heading">
|
206
|
+
<a href="#M000066" class="method-signature">
|
207
|
+
<span class="method-name">reset</span><span class="method-args">()</span>
|
208
|
+
</a>
|
209
|
+
</div>
|
210
|
+
|
211
|
+
<div class="method-description">
|
212
|
+
<p><a class="source-toggle" href="#"
|
213
|
+
onclick="toggleCode('M000066-source');return false;">[Source]</a></p>
|
214
|
+
<div class="method-source-code" id="M000066-source">
|
215
|
+
<pre>
|
216
|
+
<span class="ruby-comment cmt"># File lib/ankusa/file_system_storage.rb, line 12</span>
|
217
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">reset</span>
|
218
|
+
<span class="ruby-ivar">@freqs</span> = {}
|
219
|
+
<span class="ruby-ivar">@total_word_counts</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-value">0</span>)
|
220
|
+
<span class="ruby-ivar">@total_doc_counts</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-value">0</span>)
|
221
|
+
<span class="ruby-ivar">@klass_word_counts</span> = {}
|
222
|
+
<span class="ruby-ivar">@klass_doc_counts</span> = {}
|
223
|
+
<span class="ruby-keyword kw">end</span>
|
224
|
+
</pre>
|
225
|
+
</div>
|
226
|
+
</div>
|
227
|
+
</div>
|
228
|
+
|
229
|
+
<div id="method-M000069" class="method-detail">
|
230
|
+
<a name="M000069"></a>
|
231
|
+
|
232
|
+
<div class="method-heading">
|
233
|
+
<a href="#M000069" class="method-signature">
|
234
|
+
<span class="method-name">save</span><span class="method-args">(file = nil)</span>
|
235
|
+
</a>
|
236
|
+
</div>
|
237
|
+
|
238
|
+
<div class="method-description">
|
239
|
+
<p><a class="source-toggle" href="#"
|
240
|
+
onclick="toggleCode('M000069-source');return false;">[Source]</a></p>
|
241
|
+
<div class="method-source-code" id="M000069-source">
|
242
|
+
<pre>
|
243
|
+
<span class="ruby-comment cmt"># File lib/ankusa/file_system_storage.rb, line 41</span>
|
244
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">save</span>(<span class="ruby-identifier">file</span> = <span class="ruby-keyword kw">nil</span>)
|
245
|
+
<span class="ruby-identifier">file</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@file</span>
|
246
|
+
<span class="ruby-identifier">data</span> = { <span class="ruby-identifier">:freqs</span> =<span class="ruby-operator">></span> <span class="ruby-ivar">@freqs</span>,
|
247
|
+
<span class="ruby-identifier">:total_word_counts</span> =<span class="ruby-operator">></span> <span class="ruby-ivar">@total_word_counts</span>,
|
248
|
+
<span class="ruby-identifier">:total_doc_counts</span> =<span class="ruby-operator">></span> <span class="ruby-ivar">@total_doc_counts</span>,
|
249
|
+
<span class="ruby-identifier">:klass_word_counts</span> =<span class="ruby-operator">></span> <span class="ruby-ivar">@klass_word_counts</span>,
|
250
|
+
<span class="ruby-identifier">:klass_doc_counts</span> =<span class="ruby-operator">></span> <span class="ruby-ivar">@klass_doc_counts</span> }
|
251
|
+
<span class="ruby-constant">File</span>.<span class="ruby-identifier">open</span>(<span class="ruby-identifier">file</span>, <span class="ruby-value str">'w+'</span>) <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">f</span><span class="ruby-operator">|</span>
|
252
|
+
<span class="ruby-constant">Marshal</span>.<span class="ruby-identifier">dump</span>(<span class="ruby-identifier">data</span>, <span class="ruby-identifier">f</span>)
|
253
|
+
<span class="ruby-keyword kw">end</span>
|
254
|
+
<span class="ruby-keyword kw">end</span>
|
255
|
+
</pre>
|
256
|
+
</div>
|
257
|
+
</div>
|
258
|
+
</div>
|
259
|
+
|
260
|
+
|
261
|
+
</div>
|
262
|
+
|
263
|
+
|
264
|
+
</div>
|
265
|
+
|
266
|
+
|
267
|
+
<div id="validator-badges">
|
268
|
+
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
269
|
+
</div>
|
270
|
+
|
271
|
+
</body>
|
272
|
+
</html>
|
data/docs/created.rid
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
Wed, 05 Jan 2011 17:44:50 -0500
|
data/docs/files/README_rdoc.html
CHANGED
@@ -56,7 +56,7 @@
|
|
56
56
|
</tr>
|
57
57
|
<tr class="top-aligned-row">
|
58
58
|
<td><strong>Last Update:</strong></td>
|
59
|
-
<td>
|
59
|
+
<td>Wed Jan 05 17:43:40 -0500 2011</td>
|
60
60
|
</tr>
|
61
61
|
</table>
|
62
62
|
</div>
|
@@ -74,7 +74,8 @@
|
|
74
74
|
<a href="../classes/Ankusa.html">Ankusa</a> is a text classifier in Ruby
|
75
75
|
that can use either Hadoop‘s HBase or Cassandra for storage. Because
|
76
76
|
it uses HBase or Cassandra as a backend, the training corpus can be many
|
77
|
-
terabytes in size
|
77
|
+
terabytes in size (though additional memory and single file storage
|
78
|
+
abilities also exist for smaller corpora).
|
78
79
|
</p>
|
79
80
|
<p>
|
80
81
|
<a href="../classes/Ankusa.html">Ankusa</a> currently provides both a Naive
|
@@ -181,8 +182,8 @@ The API is the same as the NaiveBayesClassifier, except rather than calling
|
|
181
182
|
<h2>Storage Methods</h2>
|
182
183
|
<p>
|
183
184
|
<a href="../classes/Ankusa.html">Ankusa</a> has a generalized storage
|
184
|
-
interface that has been implemented for HBase, Cassandra, and
|
185
|
-
storage.
|
185
|
+
interface that has been implemented for HBase, Cassandra, single file, and
|
186
|
+
in-memory storage.
|
186
187
|
</p>
|
187
188
|
<p>
|
188
189
|
Memory storage can be used when you have a very small corpora
|
@@ -192,6 +193,20 @@ Memory storage can be used when you have a very small corpora
|
|
192
193
|
storage = Ankusa::MemoryStorage.new
|
193
194
|
</pre>
|
194
195
|
<p>
|
196
|
+
FileSystem storage can be used when you have a very small corpora and want
|
197
|
+
to persist the classification results.
|
198
|
+
</p>
|
199
|
+
<pre>
|
200
|
+
require 'ankusa/file_system_storage'
|
201
|
+
storage = Ankusa::FileSystemStorage.new '/path/to/file'
|
202
|
+
# Do classification ...
|
203
|
+
storage.save
|
204
|
+
</pre>
|
205
|
+
<p>
|
206
|
+
The FileSystem storage does NOT save to the filesystem automatically, the
|
207
|
+
save method must be invoked to save and persist the results
|
208
|
+
</p>
|
209
|
+
<p>
|
195
210
|
HBase storage:
|
196
211
|
</p>
|
197
212
|
<pre>
|
@@ -227,7 +242,7 @@ To use the Cassandra storage class:
|
|
227
242
|
</pre>
|
228
243
|
<h2>Running Tests</h2>
|
229
244
|
<p>
|
230
|
-
You can run the tests for any of the
|
245
|
+
You can run the tests for any of the four storage methods. For instance,
|
231
246
|
for memory storage:
|
232
247
|
</p>
|
233
248
|
<pre>
|
@@ -241,6 +256,8 @@ set the configuration params. Then:
|
|
241
256
|
rake test_hbase
|
242
257
|
# or
|
243
258
|
rake test_cassandra
|
259
|
+
# or
|
260
|
+
rake test_filesystem
|
244
261
|
</pre>
|
245
262
|
|
246
263
|
</div>
|
@@ -0,0 +1,108 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
+
<head>
|
8
|
+
<title>File: file_system_storage.rb</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
+
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
12
|
+
<script type="text/javascript">
|
13
|
+
// <![CDATA[
|
14
|
+
|
15
|
+
function popupCode( url ) {
|
16
|
+
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
+
}
|
18
|
+
|
19
|
+
function toggleCode( id ) {
|
20
|
+
if ( document.getElementById )
|
21
|
+
elem = document.getElementById( id );
|
22
|
+
else if ( document.all )
|
23
|
+
elem = eval( "document.all." + id );
|
24
|
+
else
|
25
|
+
return false;
|
26
|
+
|
27
|
+
elemStyle = elem.style;
|
28
|
+
|
29
|
+
if ( elemStyle.display != "block" ) {
|
30
|
+
elemStyle.display = "block"
|
31
|
+
} else {
|
32
|
+
elemStyle.display = "none"
|
33
|
+
}
|
34
|
+
|
35
|
+
return true;
|
36
|
+
}
|
37
|
+
|
38
|
+
// Make codeblocks hidden by default
|
39
|
+
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
+
|
41
|
+
// ]]>
|
42
|
+
</script>
|
43
|
+
|
44
|
+
</head>
|
45
|
+
<body>
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
<div id="fileHeader">
|
50
|
+
<h1>file_system_storage.rb</h1>
|
51
|
+
<table class="header-table">
|
52
|
+
<tr class="top-aligned-row">
|
53
|
+
<td><strong>Path:</strong></td>
|
54
|
+
<td>lib/ankusa/file_system_storage.rb
|
55
|
+
</td>
|
56
|
+
</tr>
|
57
|
+
<tr class="top-aligned-row">
|
58
|
+
<td><strong>Last Update:</strong></td>
|
59
|
+
<td>Wed Jan 05 17:37:03 -0500 2011</td>
|
60
|
+
</tr>
|
61
|
+
</table>
|
62
|
+
</div>
|
63
|
+
<!-- banner header -->
|
64
|
+
|
65
|
+
<div id="bodyContent">
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
<div id="contextContent">
|
70
|
+
|
71
|
+
|
72
|
+
<div id="requires-list">
|
73
|
+
<h3 class="section-bar">Required files</h3>
|
74
|
+
|
75
|
+
<div class="name-list">
|
76
|
+
ankusa/memory_storage
|
77
|
+
</div>
|
78
|
+
</div>
|
79
|
+
|
80
|
+
</div>
|
81
|
+
|
82
|
+
|
83
|
+
</div>
|
84
|
+
|
85
|
+
|
86
|
+
<!-- if includes -->
|
87
|
+
|
88
|
+
<div id="section">
|
89
|
+
|
90
|
+
|
91
|
+
|
92
|
+
|
93
|
+
|
94
|
+
|
95
|
+
|
96
|
+
|
97
|
+
<!-- if method_list -->
|
98
|
+
|
99
|
+
|
100
|
+
</div>
|
101
|
+
|
102
|
+
|
103
|
+
<div id="validator-badges">
|
104
|
+
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
105
|
+
</div>
|
106
|
+
|
107
|
+
</body>
|
108
|
+
</html>
|
data/docs/fr_class_index.html
CHANGED
@@ -23,6 +23,7 @@
|
|
23
23
|
<a href="classes/Ankusa.html">Ankusa</a><br />
|
24
24
|
<a href="classes/Ankusa/CassandraStorage.html">Ankusa::CassandraStorage</a><br />
|
25
25
|
<a href="classes/Ankusa/Classifier.html">Ankusa::Classifier</a><br />
|
26
|
+
<a href="classes/Ankusa/FileSystemStorage.html">Ankusa::FileSystemStorage</a><br />
|
26
27
|
<a href="classes/Ankusa/HBaseStorage.html">Ankusa::HBaseStorage</a><br />
|
27
28
|
<a href="classes/Ankusa/KLDivergenceClassifier.html">Ankusa::KLDivergenceClassifier</a><br />
|
28
29
|
<a href="classes/Ankusa/MemoryStorage.html">Ankusa::MemoryStorage</a><br />
|
data/docs/fr_file_index.html
CHANGED
@@ -25,6 +25,7 @@
|
|
25
25
|
<a href="files/lib/ankusa/cassandra_storage_rb.html">lib/ankusa/cassandra_storage.rb</a><br />
|
26
26
|
<a href="files/lib/ankusa/classifier_rb.html">lib/ankusa/classifier.rb</a><br />
|
27
27
|
<a href="files/lib/ankusa/extensions_rb.html">lib/ankusa/extensions.rb</a><br />
|
28
|
+
<a href="files/lib/ankusa/file_system_storage_rb.html">lib/ankusa/file_system_storage.rb</a><br />
|
28
29
|
<a href="files/lib/ankusa/hasher_rb.html">lib/ankusa/hasher.rb</a><br />
|
29
30
|
<a href="files/lib/ankusa/hbase_storage_rb.html">lib/ankusa/hbase_storage.rb</a><br />
|
30
31
|
<a href="files/lib/ankusa/kl_divergence_rb.html">lib/ankusa/kl_divergence.rb</a><br />
|
data/docs/fr_method_index.html
CHANGED
@@ -24,60 +24,65 @@
|
|
24
24
|
<a href="classes/Ankusa/TextHash.html#M000062">add_word (Ankusa::TextHash)</a><br />
|
25
25
|
<a href="classes/Ankusa/TextHash.html#M000063">atomize (Ankusa::TextHash)</a><br />
|
26
26
|
<a href="classes/Ankusa/NaiveBayesClassifier.html#M000025">classifications (Ankusa::NaiveBayesClassifier)</a><br />
|
27
|
-
<a href="classes/Ankusa/KLDivergenceClassifier.html#M000041">classify (Ankusa::KLDivergenceClassifier)</a><br />
|
28
27
|
<a href="classes/Ankusa/NaiveBayesClassifier.html#M000024">classify (Ankusa::NaiveBayesClassifier)</a><br />
|
29
|
-
<a href="classes/Ankusa/
|
30
|
-
<a href="classes/Ankusa/CassandraStorage.html#M000010">classnames (Ankusa::CassandraStorage)</a><br />
|
28
|
+
<a href="classes/Ankusa/KLDivergenceClassifier.html#M000041">classify (Ankusa::KLDivergenceClassifier)</a><br />
|
31
29
|
<a href="classes/Ankusa/HBaseStorage.html#M000044">classnames (Ankusa::HBaseStorage)</a><br />
|
30
|
+
<a href="classes/Ankusa/CassandraStorage.html#M000010">classnames (Ankusa::CassandraStorage)</a><br />
|
31
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000028">classnames (Ankusa::MemoryStorage)</a><br />
|
32
32
|
<a href="classes/Ankusa/MemoryStorage.html#M000040">close (Ankusa::MemoryStorage)</a><br />
|
33
|
-
<a href="classes/Ankusa/HBaseStorage.html#M000056">close (Ankusa::HBaseStorage)</a><br />
|
34
33
|
<a href="classes/Ankusa/CassandraStorage.html#M000022">close (Ankusa::CassandraStorage)</a><br />
|
34
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000056">close (Ankusa::HBaseStorage)</a><br />
|
35
35
|
<a href="classes/Ankusa/KLDivergenceClassifier.html#M000042">distances (Ankusa::KLDivergenceClassifier)</a><br />
|
36
36
|
<a href="classes/Ankusa/Classifier.html#M000007">doc_count_totals (Ankusa::Classifier)</a><br />
|
37
37
|
<a href="classes/Ankusa/CassandraStorage.html#M000021">doc_count_totals (Ankusa::CassandraStorage)</a><br />
|
38
|
-
<a href="classes/Ankusa/MemoryStorage.html#M000039">doc_count_totals (Ankusa::MemoryStorage)</a><br />
|
39
38
|
<a href="classes/Ankusa/HBaseStorage.html#M000055">doc_count_totals (Ankusa::HBaseStorage)</a><br />
|
40
|
-
<a href="classes/Ankusa/
|
39
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000039">doc_count_totals (Ankusa::MemoryStorage)</a><br />
|
41
40
|
<a href="classes/Ankusa/HBaseStorage.html#M000046">drop_tables (Ankusa::HBaseStorage)</a><br />
|
41
|
+
<a href="classes/Ankusa/CassandraStorage.html#M000012">drop_tables (Ankusa::CassandraStorage)</a><br />
|
42
42
|
<a href="classes/Ankusa/MemoryStorage.html#M000030">drop_tables (Ankusa::MemoryStorage)</a><br />
|
43
|
+
<a href="classes/Ankusa/FileSystemStorage.html#M000067">drop_tables (Ankusa::FileSystemStorage)</a><br />
|
43
44
|
<a href="classes/Ankusa/HBaseStorage.html#M000059">freq_table (Ankusa::HBaseStorage)</a><br />
|
44
45
|
<a href="classes/Ankusa/CassandraStorage.html#M000017">get_doc_count (Ankusa::CassandraStorage)</a><br />
|
45
46
|
<a href="classes/Ankusa/HBaseStorage.html#M000051">get_doc_count (Ankusa::HBaseStorage)</a><br />
|
46
47
|
<a href="classes/Ankusa/MemoryStorage.html#M000035">get_doc_count (Ankusa::MemoryStorage)</a><br />
|
47
48
|
<a href="classes/Ankusa/CassandraStorage.html#M000023">get_summary (Ankusa::CassandraStorage)</a><br />
|
48
49
|
<a href="classes/Ankusa/HBaseStorage.html#M000057">get_summary (Ankusa::HBaseStorage)</a><br />
|
49
|
-
<a href="classes/Ankusa/MemoryStorage.html#M000034">get_total_word_count (Ankusa::MemoryStorage)</a><br />
|
50
50
|
<a href="classes/Ankusa/CassandraStorage.html#M000016">get_total_word_count (Ankusa::CassandraStorage)</a><br />
|
51
51
|
<a href="classes/Ankusa/HBaseStorage.html#M000050">get_total_word_count (Ankusa::HBaseStorage)</a><br />
|
52
|
-
<a href="classes/Ankusa/
|
53
|
-
<a href="classes/Ankusa/CassandraStorage.html#M000015">get_vocabulary_sizes (Ankusa::CassandraStorage)</a><br />
|
52
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000034">get_total_word_count (Ankusa::MemoryStorage)</a><br />
|
54
53
|
<a href="classes/Ankusa/MemoryStorage.html#M000032">get_vocabulary_sizes (Ankusa::MemoryStorage)</a><br />
|
54
|
+
<a href="classes/Ankusa/CassandraStorage.html#M000015">get_vocabulary_sizes (Ankusa::CassandraStorage)</a><br />
|
55
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000049">get_vocabulary_sizes (Ankusa::HBaseStorage)</a><br />
|
55
56
|
<a href="classes/Ankusa/MemoryStorage.html#M000033">get_word_counts (Ankusa::MemoryStorage)</a><br />
|
56
57
|
<a href="classes/Ankusa/CassandraStorage.html#M000014">get_word_counts (Ankusa::CassandraStorage)</a><br />
|
57
58
|
<a href="classes/Ankusa/HBaseStorage.html#M000048">get_word_counts (Ankusa::HBaseStorage)</a><br />
|
58
59
|
<a href="classes/Ankusa/Classifier.html#M000006">get_word_probs (Ankusa::Classifier)</a><br />
|
59
|
-
<a href="classes/Ankusa/HBaseStorage.html#M000054">incr_doc_count (Ankusa::HBaseStorage)</a><br />
|
60
60
|
<a href="classes/Ankusa/MemoryStorage.html#M000038">incr_doc_count (Ankusa::MemoryStorage)</a><br />
|
61
61
|
<a href="classes/Ankusa/CassandraStorage.html#M000020">incr_doc_count (Ankusa::CassandraStorage)</a><br />
|
62
|
-
<a href="classes/Ankusa/
|
62
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000054">incr_doc_count (Ankusa::HBaseStorage)</a><br />
|
63
63
|
<a href="classes/Ankusa/HBaseStorage.html#M000053">incr_total_word_count (Ankusa::HBaseStorage)</a><br />
|
64
|
+
<a href="classes/Ankusa/CassandraStorage.html#M000019">incr_total_word_count (Ankusa::CassandraStorage)</a><br />
|
64
65
|
<a href="classes/Ankusa/MemoryStorage.html#M000037">incr_total_word_count (Ankusa::MemoryStorage)</a><br />
|
65
66
|
<a href="classes/Ankusa/MemoryStorage.html#M000036">incr_word_count (Ankusa::MemoryStorage)</a><br />
|
66
|
-
<a href="classes/Ankusa/CassandraStorage.html#M000018">incr_word_count (Ankusa::CassandraStorage)</a><br />
|
67
67
|
<a href="classes/Ankusa/HBaseStorage.html#M000052">incr_word_count (Ankusa::HBaseStorage)</a><br />
|
68
|
-
<a href="classes/Ankusa/CassandraStorage.html#
|
69
|
-
<a href="classes/Ankusa/HBaseStorage.html#M000047">init_tables (Ankusa::HBaseStorage)</a><br />
|
68
|
+
<a href="classes/Ankusa/CassandraStorage.html#M000018">incr_word_count (Ankusa::CassandraStorage)</a><br />
|
70
69
|
<a href="classes/Ankusa/MemoryStorage.html#M000031">init_tables (Ankusa::MemoryStorage)</a><br />
|
70
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000047">init_tables (Ankusa::HBaseStorage)</a><br />
|
71
|
+
<a href="classes/Ankusa/CassandraStorage.html#M000013">init_tables (Ankusa::CassandraStorage)</a><br />
|
72
|
+
<a href="classes/Ankusa/FileSystemStorage.html#M000068">init_tables (Ankusa::FileSystemStorage)</a><br />
|
71
73
|
<a href="classes/Ankusa/NaiveBayesClassifier.html#M000026">log_likelihoods (Ankusa::NaiveBayesClassifier)</a><br />
|
74
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000043">new (Ankusa::HBaseStorage)</a><br />
|
75
|
+
<a href="classes/Ankusa/CassandraStorage.html#M000009">new (Ankusa::CassandraStorage)</a><br />
|
72
76
|
<a href="classes/Ankusa/TextHash.html#M000060">new (Ankusa::TextHash)</a><br />
|
73
77
|
<a href="classes/Ankusa/Classifier.html#M000003">new (Ankusa::Classifier)</a><br />
|
74
|
-
<a href="classes/Ankusa/
|
78
|
+
<a href="classes/Ankusa/FileSystemStorage.html#M000065">new (Ankusa::FileSystemStorage)</a><br />
|
75
79
|
<a href="classes/Ankusa/MemoryStorage.html#M000027">new (Ankusa::MemoryStorage)</a><br />
|
76
|
-
<a href="classes/Ankusa/HBaseStorage.html#M000043">new (Ankusa::HBaseStorage)</a><br />
|
77
80
|
<a href="classes/String.html#M000001">numeric? (String)</a><br />
|
81
|
+
<a href="classes/Ankusa/FileSystemStorage.html#M000066">reset (Ankusa::FileSystemStorage)</a><br />
|
78
82
|
<a href="classes/Ankusa/HBaseStorage.html#M000045">reset (Ankusa::HBaseStorage)</a><br />
|
79
|
-
<a href="classes/Ankusa/MemoryStorage.html#M000029">reset (Ankusa::MemoryStorage)</a><br />
|
80
83
|
<a href="classes/Ankusa/CassandraStorage.html#M000011">reset (Ankusa::CassandraStorage)</a><br />
|
84
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000029">reset (Ankusa::MemoryStorage)</a><br />
|
85
|
+
<a href="classes/Ankusa/FileSystemStorage.html#M000069">save (Ankusa::FileSystemStorage)</a><br />
|
81
86
|
<a href="classes/Ankusa/HBaseStorage.html#M000058">summary_table (Ankusa::HBaseStorage)</a><br />
|
82
87
|
<a href="classes/String.html#M000002">to_ascii (String)</a><br />
|
83
88
|
<a href="classes/Ankusa/Classifier.html#M000004">train (Ankusa::Classifier)</a><br />
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'ankusa/memory_storage'
|
2
|
+
|
3
|
+
module Ankusa

  # Storage backend that keeps all counts in memory (via MemoryStorage) and
  # persists them to a single file using Marshal.
  #
  # The state is read from +file+ when the object is created and is only
  # written back when #save is called explicitly.
  #
  # SECURITY NOTE: the file is read with Marshal.load, which can instantiate
  # arbitrary objects. Only point this class at files you created yourself.
  class FileSystemStorage < MemoryStorage

    # file:: path of the file used to load and save the stored counts.
    #
    # Loads existing data from +file+; starts empty if the file is missing.
    def initialize(file)
      @file = file
      init_tables
    end

    # Discards all in-memory counts. The file on disk is left untouched.
    def reset
      @freqs = {}
      @total_word_counts = Hash.new(0)
      @total_doc_counts = Hash.new(0)
      @klass_word_counts = {}
      @klass_doc_counts = {}
    end

    # Deletes the backing file (if it exists) and clears the in-memory counts.
    def drop_tables
      begin
        File.delete(@file)
      rescue Errno::ENOENT
        # Nothing to delete. Only a missing file is ignored here; the former
        # `rescue` modifier silently swallowed every StandardError.
      end
      reset
    end

    # Populates the in-memory counts from the backing file, or starts with
    # empty counts when the file does not exist yet.
    def init_tables
      # Marshal data is binary, so read it in binary mode.
      data = File.open(@file, 'rb') { |f| Marshal.load(f) }
      @freqs = data[:freqs]
      @total_word_counts = data[:total_word_counts]
      @total_doc_counts = data[:total_doc_counts]
      @klass_word_counts = data[:klass_word_counts]
      # Was data[:klass_word_counts], which replaced the per-class document
      # counts with word counts after every reload.
      @klass_doc_counts = data[:klass_doc_counts]
    rescue Errno::ENOENT
      reset
    end

    # Writes the current counts to disk.
    #
    # file:: optional destination path; defaults to the path given to +new+.
    def save(file = nil)
      file ||= @file
      data = { :freqs => @freqs,
               :total_word_counts => @total_word_counts,
               :total_doc_counts => @total_doc_counts,
               :klass_word_counts => @klass_word_counts,
               :klass_doc_counts => @klass_doc_counts }
      # Binary mode keeps the Marshal stream intact on platforms that
      # translate line endings in text mode.
      File.open(file, 'wb') do |f|
        Marshal.dump(data, f)
      end
    end

  end

end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ankusa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 7
|
10
|
-
version: 0.0.7
|
9
|
+
- 8
|
10
|
+
version: 0.0.8
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Brian Muller
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-01-05 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -46,6 +46,7 @@ files:
|
|
46
46
|
- lib/ankusa/cassandra_storage.rb
|
47
47
|
- lib/ankusa/classifier.rb
|
48
48
|
- lib/ankusa/extensions.rb
|
49
|
+
- lib/ankusa/file_system_storage.rb
|
49
50
|
- lib/ankusa/hasher.rb
|
50
51
|
- lib/ankusa/hbase_storage.rb
|
51
52
|
- lib/ankusa/kl_divergence.rb
|
@@ -58,6 +59,7 @@ files:
|
|
58
59
|
- README.rdoc
|
59
60
|
- docs/classes/Ankusa/CassandraStorage.html
|
60
61
|
- docs/classes/Ankusa/Classifier.html
|
62
|
+
- docs/classes/Ankusa/FileSystemStorage.html
|
61
63
|
- docs/classes/Ankusa/HBaseStorage.html
|
62
64
|
- docs/classes/Ankusa/KLDivergenceClassifier.html
|
63
65
|
- docs/classes/Ankusa/MemoryStorage.html
|
@@ -69,6 +71,7 @@ files:
|
|
69
71
|
- docs/files/lib/ankusa/cassandra_storage_rb.html
|
70
72
|
- docs/files/lib/ankusa/classifier_rb.html
|
71
73
|
- docs/files/lib/ankusa/extensions_rb.html
|
74
|
+
- docs/files/lib/ankusa/file_system_storage_rb.html
|
72
75
|
- docs/files/lib/ankusa/hasher_rb.html
|
73
76
|
- docs/files/lib/ankusa/hbase_storage_rb.html
|
74
77
|
- docs/files/lib/ankusa/kl_divergence_rb.html
|