ankusa 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +13 -3
- data/Rakefile +9 -2
- data/docs/classes/Ankusa.html +5 -0
- data/docs/classes/Ankusa/FileSystemStorage.html +272 -0
- data/docs/created.rid +1 -1
- data/docs/files/README_rdoc.html +22 -5
- data/docs/files/lib/ankusa/file_system_storage_rb.html +108 -0
- data/docs/fr_class_index.html +1 -0
- data/docs/fr_file_index.html +1 -0
- data/docs/fr_method_index.html +22 -17
- data/lib/ankusa/file_system_storage.rb +55 -0
- metadata +7 -4
data/README.rdoc
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
= ankusa
|
2
2
|
|
3
|
-
Ankusa is a text classifier in Ruby that can use either Hadoop's HBase or Cassandra for storage. Because it uses HBase or Cassandra as a backend, the training corpus can be many terabytes in size.
|
3
|
+
Ankusa is a text classifier in Ruby that can use either Hadoop's HBase or Cassandra for storage. Because it uses HBase or Cassandra as a backend, the training corpus can be many terabytes in size (though additional memory and single file storage abilities also exist for smaller corpora).
|
4
4
|
|
5
5
|
Ankusa currently provides both a Naive Bayes and Kullback-Leibler divergence classifier. It ignores common words (a.k.a. stop words) and stems all others. Additionally, it uses Laplacian smoothing in both classification methods.
|
6
6
|
|
@@ -80,12 +80,20 @@ The API is the same as the NaiveBayesClassifier, except rather than calling "cla
|
|
80
80
|
storage.close
|
81
81
|
|
82
82
|
== Storage Methods
|
83
|
-
Ankusa has a generalized storage interface that has been implemented for HBase, Cassandra, and in-memory storage.
|
83
|
+
Ankusa has a generalized storage interface that has been implemented for HBase, Cassandra, single file, and in-memory storage.
|
84
84
|
|
85
85
|
Memory storage can be used when you have a very small corpus
|
86
86
|
require 'ankusa/memory_storage'
|
87
87
|
storage = Ankusa::MemoryStorage.new
|
88
88
|
|
89
|
+
FileSystem storage can be used when you have a very small corpus and want to persist the classification results.
|
90
|
+
require 'ankusa/file_system_storage'
|
91
|
+
storage = Ankusa::FileSystemStorage.new '/path/to/file'
|
92
|
+
# Do classification ...
|
93
|
+
storage.save
|
94
|
+
|
95
|
+
The FileSystem storage does NOT save to the filesystem automatically; the #save method must be invoked to persist the results.
|
96
|
+
|
89
97
|
HBase storage:
|
90
98
|
require 'ankusa/hbase_storage'
|
91
99
|
# defaults: host='localhost', port=9090, frequency_tablename="ankusa_word_frequencies", summary_tablename="ankusa_summary"
|
@@ -103,13 +111,15 @@ To use the Cassandra storage class:
|
|
103
111
|
|
104
112
|
|
105
113
|
== Running Tests
|
106
|
-
You can run the tests for any of the
|
114
|
+
You can run the tests for any of the four storage methods. For instance, for memory storage:
|
107
115
|
rake test_memory
|
108
116
|
|
109
117
|
For the other methods you will need to edit the file test/config.yml and set the configuration params. Then:
|
110
118
|
rake test_hbase
|
111
119
|
# or
|
112
120
|
rake test_cassandra
|
121
|
+
# or
|
122
|
+
rake test_filesystem
|
113
123
|
|
114
124
|
|
115
125
|
|
data/Rakefile
CHANGED
@@ -33,11 +33,18 @@ Rake::TestTask.new("test_cassandra") { |t|
|
|
33
33
|
t.verbose = true
|
34
34
|
}
|
35
35
|
|
36
|
+
desc "Run all unit tests with FileSystem storage"
|
37
|
+
Rake::TestTask.new("test_filesystem") { |t|
|
38
|
+
t.libs << "lib"
|
39
|
+
t.test_files = FileList['test/hasher_test.rb', 'test/file_system_classifier_test.rb']
|
40
|
+
t.verbose = true
|
41
|
+
}
|
42
|
+
|
36
43
|
spec = Gem::Specification.new do |s|
|
37
44
|
s.name = "ankusa"
|
38
|
-
s.version = "0.0.
|
45
|
+
s.version = "0.0.8"
|
39
46
|
s.authors = ["Brian Muller"]
|
40
|
-
s.date = %q{
|
47
|
+
s.date = %q{2011-01-05}
|
41
48
|
s.description = "Text classifier with HBase or Cassandra storage"
|
42
49
|
s.summary = "Text classifier in Ruby that uses Hadoop's HBase or Cassandra for storage"
|
43
50
|
s.email = "brian.muller@livingsocial.com"
|
data/docs/classes/Ankusa.html
CHANGED
@@ -62,6 +62,10 @@
|
|
62
62
|
<a href="../files/lib/ankusa/classifier_rb.html">
|
63
63
|
lib/ankusa/classifier.rb
|
64
64
|
</a>
|
65
|
+
<br />
|
66
|
+
<a href="../files/lib/ankusa/file_system_storage_rb.html">
|
67
|
+
lib/ankusa/file_system_storage.rb
|
68
|
+
</a>
|
65
69
|
<br />
|
66
70
|
<a href="../files/lib/ankusa/hasher_rb.html">
|
67
71
|
lib/ankusa/hasher.rb
|
@@ -130,6 +134,7 @@ cassandra
|
|
130
134
|
|
131
135
|
Module <a href="Ankusa/Classifier.html" class="link">Ankusa::Classifier</a><br />
|
132
136
|
Class <a href="Ankusa/CassandraStorage.html" class="link">Ankusa::CassandraStorage</a><br />
|
137
|
+
Class <a href="Ankusa/FileSystemStorage.html" class="link">Ankusa::FileSystemStorage</a><br />
|
133
138
|
Class <a href="Ankusa/HBaseStorage.html" class="link">Ankusa::HBaseStorage</a><br />
|
134
139
|
Class <a href="Ankusa/KLDivergenceClassifier.html" class="link">Ankusa::KLDivergenceClassifier</a><br />
|
135
140
|
Class <a href="Ankusa/MemoryStorage.html" class="link">Ankusa::MemoryStorage</a><br />
|
@@ -0,0 +1,272 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
+
<head>
|
8
|
+
<title>Class: Ankusa::FileSystemStorage</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
+
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
12
|
+
<script type="text/javascript">
|
13
|
+
// <![CDATA[
|
14
|
+
|
15
|
+
function popupCode( url ) {
|
16
|
+
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
+
}
|
18
|
+
|
19
|
+
function toggleCode( id ) {
|
20
|
+
if ( document.getElementById )
|
21
|
+
elem = document.getElementById( id );
|
22
|
+
else if ( document.all )
|
23
|
+
elem = eval( "document.all." + id );
|
24
|
+
else
|
25
|
+
return false;
|
26
|
+
|
27
|
+
elemStyle = elem.style;
|
28
|
+
|
29
|
+
if ( elemStyle.display != "block" ) {
|
30
|
+
elemStyle.display = "block"
|
31
|
+
} else {
|
32
|
+
elemStyle.display = "none"
|
33
|
+
}
|
34
|
+
|
35
|
+
return true;
|
36
|
+
}
|
37
|
+
|
38
|
+
// Make codeblocks hidden by default
|
39
|
+
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
+
|
41
|
+
// ]]>
|
42
|
+
</script>
|
43
|
+
|
44
|
+
</head>
|
45
|
+
<body>
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
<div id="classHeader">
|
50
|
+
<table class="header-table">
|
51
|
+
<tr class="top-aligned-row">
|
52
|
+
<td><strong>Class</strong></td>
|
53
|
+
<td class="class-name-in-header">Ankusa::FileSystemStorage</td>
|
54
|
+
</tr>
|
55
|
+
<tr class="top-aligned-row">
|
56
|
+
<td><strong>In:</strong></td>
|
57
|
+
<td>
|
58
|
+
<a href="../../files/lib/ankusa/file_system_storage_rb.html">
|
59
|
+
lib/ankusa/file_system_storage.rb
|
60
|
+
</a>
|
61
|
+
<br />
|
62
|
+
</td>
|
63
|
+
</tr>
|
64
|
+
|
65
|
+
<tr class="top-aligned-row">
|
66
|
+
<td><strong>Parent:</strong></td>
|
67
|
+
<td>
|
68
|
+
<a href="MemoryStorage.html">
|
69
|
+
MemoryStorage
|
70
|
+
</a>
|
71
|
+
</td>
|
72
|
+
</tr>
|
73
|
+
</table>
|
74
|
+
</div>
|
75
|
+
<!-- banner header -->
|
76
|
+
|
77
|
+
<div id="bodyContent">
|
78
|
+
|
79
|
+
|
80
|
+
|
81
|
+
<div id="contextContent">
|
82
|
+
|
83
|
+
|
84
|
+
|
85
|
+
</div>
|
86
|
+
|
87
|
+
<div id="method-list">
|
88
|
+
<h3 class="section-bar">Methods</h3>
|
89
|
+
|
90
|
+
<div class="name-list">
|
91
|
+
<a href="#M000067">drop_tables</a>
|
92
|
+
<a href="#M000068">init_tables</a>
|
93
|
+
<a href="#M000065">new</a>
|
94
|
+
<a href="#M000066">reset</a>
|
95
|
+
<a href="#M000069">save</a>
|
96
|
+
</div>
|
97
|
+
</div>
|
98
|
+
|
99
|
+
</div>
|
100
|
+
|
101
|
+
|
102
|
+
<!-- if includes -->
|
103
|
+
|
104
|
+
<div id="section">
|
105
|
+
|
106
|
+
|
107
|
+
|
108
|
+
|
109
|
+
|
110
|
+
|
111
|
+
|
112
|
+
|
113
|
+
<!-- if method_list -->
|
114
|
+
<div id="methods">
|
115
|
+
<h3 class="section-bar">Public Class methods</h3>
|
116
|
+
|
117
|
+
<div id="method-M000065" class="method-detail">
|
118
|
+
<a name="M000065"></a>
|
119
|
+
|
120
|
+
<div class="method-heading">
|
121
|
+
<a href="#M000065" class="method-signature">
|
122
|
+
<span class="method-name">new</span><span class="method-args">(file)</span>
|
123
|
+
</a>
|
124
|
+
</div>
|
125
|
+
|
126
|
+
<div class="method-description">
|
127
|
+
<p><a class="source-toggle" href="#"
|
128
|
+
onclick="toggleCode('M000065-source');return false;">[Source]</a></p>
|
129
|
+
<div class="method-source-code" id="M000065-source">
|
130
|
+
<pre>
|
131
|
+
<span class="ruby-comment cmt"># File lib/ankusa/file_system_storage.rb, line 7</span>
|
132
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">file</span>)
|
133
|
+
<span class="ruby-ivar">@file</span> = <span class="ruby-identifier">file</span>
|
134
|
+
<span class="ruby-identifier">init_tables</span>
|
135
|
+
<span class="ruby-keyword kw">end</span>
|
136
|
+
</pre>
|
137
|
+
</div>
|
138
|
+
</div>
|
139
|
+
</div>
|
140
|
+
|
141
|
+
<h3 class="section-bar">Public Instance methods</h3>
|
142
|
+
|
143
|
+
<div id="method-M000067" class="method-detail">
|
144
|
+
<a name="M000067"></a>
|
145
|
+
|
146
|
+
<div class="method-heading">
|
147
|
+
<a href="#M000067" class="method-signature">
|
148
|
+
<span class="method-name">drop_tables</span><span class="method-args">()</span>
|
149
|
+
</a>
|
150
|
+
</div>
|
151
|
+
|
152
|
+
<div class="method-description">
|
153
|
+
<p><a class="source-toggle" href="#"
|
154
|
+
onclick="toggleCode('M000067-source');return false;">[Source]</a></p>
|
155
|
+
<div class="method-source-code" id="M000067-source">
|
156
|
+
<pre>
|
157
|
+
<span class="ruby-comment cmt"># File lib/ankusa/file_system_storage.rb, line 20</span>
|
158
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">drop_tables</span>
|
159
|
+
<span class="ruby-constant">File</span>.<span class="ruby-identifier">delete</span>(<span class="ruby-ivar">@file</span>) <span class="ruby-keyword kw">rescue</span> <span class="ruby-constant">Errno</span><span class="ruby-operator">::</span><span class="ruby-constant">ENOENT</span>
|
160
|
+
<span class="ruby-identifier">reset</span>
|
161
|
+
<span class="ruby-keyword kw">end</span>
|
162
|
+
</pre>
|
163
|
+
</div>
|
164
|
+
</div>
|
165
|
+
</div>
|
166
|
+
|
167
|
+
<div id="method-M000068" class="method-detail">
|
168
|
+
<a name="M000068"></a>
|
169
|
+
|
170
|
+
<div class="method-heading">
|
171
|
+
<a href="#M000068" class="method-signature">
|
172
|
+
<span class="method-name">init_tables</span><span class="method-args">()</span>
|
173
|
+
</a>
|
174
|
+
</div>
|
175
|
+
|
176
|
+
<div class="method-description">
|
177
|
+
<p><a class="source-toggle" href="#"
|
178
|
+
onclick="toggleCode('M000068-source');return false;">[Source]</a></p>
|
179
|
+
<div class="method-source-code" id="M000068-source">
|
180
|
+
<pre>
|
181
|
+
<span class="ruby-comment cmt"># File lib/ankusa/file_system_storage.rb, line 25</span>
|
182
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">init_tables</span>
|
183
|
+
<span class="ruby-identifier">data</span> = {}
|
184
|
+
<span class="ruby-keyword kw">begin</span>
|
185
|
+
<span class="ruby-constant">File</span>.<span class="ruby-identifier">open</span>(<span class="ruby-ivar">@file</span>) <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">f</span><span class="ruby-operator">|</span>
|
186
|
+
<span class="ruby-identifier">data</span> = <span class="ruby-constant">Marshal</span>.<span class="ruby-identifier">load</span>(<span class="ruby-identifier">f</span>)
|
187
|
+
<span class="ruby-keyword kw">end</span>
|
188
|
+
<span class="ruby-ivar">@freqs</span> = <span class="ruby-identifier">data</span>[<span class="ruby-identifier">:freqs</span>]
|
189
|
+
<span class="ruby-ivar">@total_word_counts</span> = <span class="ruby-identifier">data</span>[<span class="ruby-identifier">:total_word_counts</span>]
|
190
|
+
<span class="ruby-ivar">@total_doc_counts</span> = <span class="ruby-identifier">data</span>[<span class="ruby-identifier">:total_doc_counts</span>]
|
191
|
+
<span class="ruby-ivar">@klass_word_counts</span> = <span class="ruby-identifier">data</span>[<span class="ruby-identifier">:klass_word_counts</span>]
|
192
|
+
<span class="ruby-ivar">@klass_doc_counts</span> = <span class="ruby-identifier">data</span>[<span class="ruby-identifier">:klass_word_counts</span>]
|
193
|
+
<span class="ruby-keyword kw">rescue</span> <span class="ruby-constant">Errno</span><span class="ruby-operator">::</span><span class="ruby-constant">ENOENT</span>
|
194
|
+
<span class="ruby-identifier">reset</span>
|
195
|
+
<span class="ruby-keyword kw">end</span>
|
196
|
+
<span class="ruby-keyword kw">end</span>
|
197
|
+
</pre>
|
198
|
+
</div>
|
199
|
+
</div>
|
200
|
+
</div>
|
201
|
+
|
202
|
+
<div id="method-M000066" class="method-detail">
|
203
|
+
<a name="M000066"></a>
|
204
|
+
|
205
|
+
<div class="method-heading">
|
206
|
+
<a href="#M000066" class="method-signature">
|
207
|
+
<span class="method-name">reset</span><span class="method-args">()</span>
|
208
|
+
</a>
|
209
|
+
</div>
|
210
|
+
|
211
|
+
<div class="method-description">
|
212
|
+
<p><a class="source-toggle" href="#"
|
213
|
+
onclick="toggleCode('M000066-source');return false;">[Source]</a></p>
|
214
|
+
<div class="method-source-code" id="M000066-source">
|
215
|
+
<pre>
|
216
|
+
<span class="ruby-comment cmt"># File lib/ankusa/file_system_storage.rb, line 12</span>
|
217
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">reset</span>
|
218
|
+
<span class="ruby-ivar">@freqs</span> = {}
|
219
|
+
<span class="ruby-ivar">@total_word_counts</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-value">0</span>)
|
220
|
+
<span class="ruby-ivar">@total_doc_counts</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-value">0</span>)
|
221
|
+
<span class="ruby-ivar">@klass_word_counts</span> = {}
|
222
|
+
<span class="ruby-ivar">@klass_doc_counts</span> = {}
|
223
|
+
<span class="ruby-keyword kw">end</span>
|
224
|
+
</pre>
|
225
|
+
</div>
|
226
|
+
</div>
|
227
|
+
</div>
|
228
|
+
|
229
|
+
<div id="method-M000069" class="method-detail">
|
230
|
+
<a name="M000069"></a>
|
231
|
+
|
232
|
+
<div class="method-heading">
|
233
|
+
<a href="#M000069" class="method-signature">
|
234
|
+
<span class="method-name">save</span><span class="method-args">(file = nil)</span>
|
235
|
+
</a>
|
236
|
+
</div>
|
237
|
+
|
238
|
+
<div class="method-description">
|
239
|
+
<p><a class="source-toggle" href="#"
|
240
|
+
onclick="toggleCode('M000069-source');return false;">[Source]</a></p>
|
241
|
+
<div class="method-source-code" id="M000069-source">
|
242
|
+
<pre>
|
243
|
+
<span class="ruby-comment cmt"># File lib/ankusa/file_system_storage.rb, line 41</span>
|
244
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">save</span>(<span class="ruby-identifier">file</span> = <span class="ruby-keyword kw">nil</span>)
|
245
|
+
<span class="ruby-identifier">file</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@file</span>
|
246
|
+
<span class="ruby-identifier">data</span> = { <span class="ruby-identifier">:freqs</span> =<span class="ruby-operator">></span> <span class="ruby-ivar">@freqs</span>,
|
247
|
+
<span class="ruby-identifier">:total_word_counts</span> =<span class="ruby-operator">></span> <span class="ruby-ivar">@total_word_counts</span>,
|
248
|
+
<span class="ruby-identifier">:total_doc_counts</span> =<span class="ruby-operator">></span> <span class="ruby-ivar">@total_doc_counts</span>,
|
249
|
+
<span class="ruby-identifier">:klass_word_counts</span> =<span class="ruby-operator">></span> <span class="ruby-ivar">@klass_word_counts</span>,
|
250
|
+
<span class="ruby-identifier">:klass_doc_counts</span> =<span class="ruby-operator">></span> <span class="ruby-ivar">@klass_doc_counts</span> }
|
251
|
+
<span class="ruby-constant">File</span>.<span class="ruby-identifier">open</span>(<span class="ruby-identifier">file</span>, <span class="ruby-value str">'w+'</span>) <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">f</span><span class="ruby-operator">|</span>
|
252
|
+
<span class="ruby-constant">Marshal</span>.<span class="ruby-identifier">dump</span>(<span class="ruby-identifier">data</span>, <span class="ruby-identifier">f</span>)
|
253
|
+
<span class="ruby-keyword kw">end</span>
|
254
|
+
<span class="ruby-keyword kw">end</span>
|
255
|
+
</pre>
|
256
|
+
</div>
|
257
|
+
</div>
|
258
|
+
</div>
|
259
|
+
|
260
|
+
|
261
|
+
</div>
|
262
|
+
|
263
|
+
|
264
|
+
</div>
|
265
|
+
|
266
|
+
|
267
|
+
<div id="validator-badges">
|
268
|
+
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
269
|
+
</div>
|
270
|
+
|
271
|
+
</body>
|
272
|
+
</html>
|
data/docs/created.rid
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
Wed, 05 Jan 2011 17:44:50 -0500
|
data/docs/files/README_rdoc.html
CHANGED
@@ -56,7 +56,7 @@
|
|
56
56
|
</tr>
|
57
57
|
<tr class="top-aligned-row">
|
58
58
|
<td><strong>Last Update:</strong></td>
|
59
|
-
<td>
|
59
|
+
<td>Wed Jan 05 17:43:40 -0500 2011</td>
|
60
60
|
</tr>
|
61
61
|
</table>
|
62
62
|
</div>
|
@@ -74,7 +74,8 @@
|
|
74
74
|
<a href="../classes/Ankusa.html">Ankusa</a> is a text classifier in Ruby
|
75
75
|
that can use either Hadoop‘s HBase or Cassandra for storage. Because
|
76
76
|
it uses HBase or Cassandra as a backend, the training corpus can be many
|
77
|
-
terabytes in size
|
77
|
+
terabytes in size (though additional memory and single file storage
|
78
|
+
abilities also exist for smaller corpora).
|
78
79
|
</p>
|
79
80
|
<p>
|
80
81
|
<a href="../classes/Ankusa.html">Ankusa</a> currently provides both a Naive
|
@@ -181,8 +182,8 @@ The API is the same as the NaiveBayesClassifier, except rather than calling
|
|
181
182
|
<h2>Storage Methods</h2>
|
182
183
|
<p>
|
183
184
|
<a href="../classes/Ankusa.html">Ankusa</a> has a generalized storage
|
184
|
-
interface that has been implemented for HBase, Cassandra, and
|
185
|
-
storage.
|
185
|
+
interface that has been implemented for HBase, Cassandra, single file, and
|
186
|
+
in-memory storage.
|
186
187
|
</p>
|
187
188
|
<p>
|
188
189
|
Memory storage can be used when you have a very small corpora
|
@@ -192,6 +193,20 @@ Memory storage can be used when you have a very small corpora
|
|
192
193
|
storage = Ankusa::MemoryStorage.new
|
193
194
|
</pre>
|
194
195
|
<p>
|
196
|
+
FileSystem storage can be used when you have a very small corpora and want
|
197
|
+
to persist the classification results.
|
198
|
+
</p>
|
199
|
+
<pre>
|
200
|
+
require 'ankusa/file_system_storage'
|
201
|
+
storage = Ankusa::FileSystemStorage.new '/path/to/file'
|
202
|
+
# Do classification ...
|
203
|
+
storage.save
|
204
|
+
</pre>
|
205
|
+
<p>
|
206
|
+
The FileSystem storage does NOT save to the filesystem automatically, the
|
207
|
+
save method must be invoked to save and persist the results
|
208
|
+
</p>
|
209
|
+
<p>
|
195
210
|
HBase storage:
|
196
211
|
</p>
|
197
212
|
<pre>
|
@@ -227,7 +242,7 @@ To use the Cassandra storage class:
|
|
227
242
|
</pre>
|
228
243
|
<h2>Running Tests</h2>
|
229
244
|
<p>
|
230
|
-
You can run the tests for any of the
|
245
|
+
You can run the tests for any of the four storage methods. For instance,
|
231
246
|
for memory storage:
|
232
247
|
</p>
|
233
248
|
<pre>
|
@@ -241,6 +256,8 @@ set the configuration params. Then:
|
|
241
256
|
rake test_hbase
|
242
257
|
# or
|
243
258
|
rake test_cassandra
|
259
|
+
# or
|
260
|
+
rake test_filesystem
|
244
261
|
</pre>
|
245
262
|
|
246
263
|
</div>
|
@@ -0,0 +1,108 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
+
<head>
|
8
|
+
<title>File: file_system_storage.rb</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
+
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
12
|
+
<script type="text/javascript">
|
13
|
+
// <![CDATA[
|
14
|
+
|
15
|
+
function popupCode( url ) {
|
16
|
+
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
+
}
|
18
|
+
|
19
|
+
function toggleCode( id ) {
|
20
|
+
if ( document.getElementById )
|
21
|
+
elem = document.getElementById( id );
|
22
|
+
else if ( document.all )
|
23
|
+
elem = eval( "document.all." + id );
|
24
|
+
else
|
25
|
+
return false;
|
26
|
+
|
27
|
+
elemStyle = elem.style;
|
28
|
+
|
29
|
+
if ( elemStyle.display != "block" ) {
|
30
|
+
elemStyle.display = "block"
|
31
|
+
} else {
|
32
|
+
elemStyle.display = "none"
|
33
|
+
}
|
34
|
+
|
35
|
+
return true;
|
36
|
+
}
|
37
|
+
|
38
|
+
// Make codeblocks hidden by default
|
39
|
+
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
+
|
41
|
+
// ]]>
|
42
|
+
</script>
|
43
|
+
|
44
|
+
</head>
|
45
|
+
<body>
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
<div id="fileHeader">
|
50
|
+
<h1>file_system_storage.rb</h1>
|
51
|
+
<table class="header-table">
|
52
|
+
<tr class="top-aligned-row">
|
53
|
+
<td><strong>Path:</strong></td>
|
54
|
+
<td>lib/ankusa/file_system_storage.rb
|
55
|
+
</td>
|
56
|
+
</tr>
|
57
|
+
<tr class="top-aligned-row">
|
58
|
+
<td><strong>Last Update:</strong></td>
|
59
|
+
<td>Wed Jan 05 17:37:03 -0500 2011</td>
|
60
|
+
</tr>
|
61
|
+
</table>
|
62
|
+
</div>
|
63
|
+
<!-- banner header -->
|
64
|
+
|
65
|
+
<div id="bodyContent">
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
<div id="contextContent">
|
70
|
+
|
71
|
+
|
72
|
+
<div id="requires-list">
|
73
|
+
<h3 class="section-bar">Required files</h3>
|
74
|
+
|
75
|
+
<div class="name-list">
|
76
|
+
ankusa/memory_storage
|
77
|
+
</div>
|
78
|
+
</div>
|
79
|
+
|
80
|
+
</div>
|
81
|
+
|
82
|
+
|
83
|
+
</div>
|
84
|
+
|
85
|
+
|
86
|
+
<!-- if includes -->
|
87
|
+
|
88
|
+
<div id="section">
|
89
|
+
|
90
|
+
|
91
|
+
|
92
|
+
|
93
|
+
|
94
|
+
|
95
|
+
|
96
|
+
|
97
|
+
<!-- if method_list -->
|
98
|
+
|
99
|
+
|
100
|
+
</div>
|
101
|
+
|
102
|
+
|
103
|
+
<div id="validator-badges">
|
104
|
+
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
105
|
+
</div>
|
106
|
+
|
107
|
+
</body>
|
108
|
+
</html>
|
data/docs/fr_class_index.html
CHANGED
@@ -23,6 +23,7 @@
|
|
23
23
|
<a href="classes/Ankusa.html">Ankusa</a><br />
|
24
24
|
<a href="classes/Ankusa/CassandraStorage.html">Ankusa::CassandraStorage</a><br />
|
25
25
|
<a href="classes/Ankusa/Classifier.html">Ankusa::Classifier</a><br />
|
26
|
+
<a href="classes/Ankusa/FileSystemStorage.html">Ankusa::FileSystemStorage</a><br />
|
26
27
|
<a href="classes/Ankusa/HBaseStorage.html">Ankusa::HBaseStorage</a><br />
|
27
28
|
<a href="classes/Ankusa/KLDivergenceClassifier.html">Ankusa::KLDivergenceClassifier</a><br />
|
28
29
|
<a href="classes/Ankusa/MemoryStorage.html">Ankusa::MemoryStorage</a><br />
|
data/docs/fr_file_index.html
CHANGED
@@ -25,6 +25,7 @@
|
|
25
25
|
<a href="files/lib/ankusa/cassandra_storage_rb.html">lib/ankusa/cassandra_storage.rb</a><br />
|
26
26
|
<a href="files/lib/ankusa/classifier_rb.html">lib/ankusa/classifier.rb</a><br />
|
27
27
|
<a href="files/lib/ankusa/extensions_rb.html">lib/ankusa/extensions.rb</a><br />
|
28
|
+
<a href="files/lib/ankusa/file_system_storage_rb.html">lib/ankusa/file_system_storage.rb</a><br />
|
28
29
|
<a href="files/lib/ankusa/hasher_rb.html">lib/ankusa/hasher.rb</a><br />
|
29
30
|
<a href="files/lib/ankusa/hbase_storage_rb.html">lib/ankusa/hbase_storage.rb</a><br />
|
30
31
|
<a href="files/lib/ankusa/kl_divergence_rb.html">lib/ankusa/kl_divergence.rb</a><br />
|
data/docs/fr_method_index.html
CHANGED
@@ -24,60 +24,65 @@
|
|
24
24
|
<a href="classes/Ankusa/TextHash.html#M000062">add_word (Ankusa::TextHash)</a><br />
|
25
25
|
<a href="classes/Ankusa/TextHash.html#M000063">atomize (Ankusa::TextHash)</a><br />
|
26
26
|
<a href="classes/Ankusa/NaiveBayesClassifier.html#M000025">classifications (Ankusa::NaiveBayesClassifier)</a><br />
|
27
|
-
<a href="classes/Ankusa/KLDivergenceClassifier.html#M000041">classify (Ankusa::KLDivergenceClassifier)</a><br />
|
28
27
|
<a href="classes/Ankusa/NaiveBayesClassifier.html#M000024">classify (Ankusa::NaiveBayesClassifier)</a><br />
|
29
|
-
<a href="classes/Ankusa/
|
30
|
-
<a href="classes/Ankusa/CassandraStorage.html#M000010">classnames (Ankusa::CassandraStorage)</a><br />
|
28
|
+
<a href="classes/Ankusa/KLDivergenceClassifier.html#M000041">classify (Ankusa::KLDivergenceClassifier)</a><br />
|
31
29
|
<a href="classes/Ankusa/HBaseStorage.html#M000044">classnames (Ankusa::HBaseStorage)</a><br />
|
30
|
+
<a href="classes/Ankusa/CassandraStorage.html#M000010">classnames (Ankusa::CassandraStorage)</a><br />
|
31
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000028">classnames (Ankusa::MemoryStorage)</a><br />
|
32
32
|
<a href="classes/Ankusa/MemoryStorage.html#M000040">close (Ankusa::MemoryStorage)</a><br />
|
33
|
-
<a href="classes/Ankusa/HBaseStorage.html#M000056">close (Ankusa::HBaseStorage)</a><br />
|
34
33
|
<a href="classes/Ankusa/CassandraStorage.html#M000022">close (Ankusa::CassandraStorage)</a><br />
|
34
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000056">close (Ankusa::HBaseStorage)</a><br />
|
35
35
|
<a href="classes/Ankusa/KLDivergenceClassifier.html#M000042">distances (Ankusa::KLDivergenceClassifier)</a><br />
|
36
36
|
<a href="classes/Ankusa/Classifier.html#M000007">doc_count_totals (Ankusa::Classifier)</a><br />
|
37
37
|
<a href="classes/Ankusa/CassandraStorage.html#M000021">doc_count_totals (Ankusa::CassandraStorage)</a><br />
|
38
|
-
<a href="classes/Ankusa/MemoryStorage.html#M000039">doc_count_totals (Ankusa::MemoryStorage)</a><br />
|
39
38
|
<a href="classes/Ankusa/HBaseStorage.html#M000055">doc_count_totals (Ankusa::HBaseStorage)</a><br />
|
40
|
-
<a href="classes/Ankusa/
|
39
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000039">doc_count_totals (Ankusa::MemoryStorage)</a><br />
|
41
40
|
<a href="classes/Ankusa/HBaseStorage.html#M000046">drop_tables (Ankusa::HBaseStorage)</a><br />
|
41
|
+
<a href="classes/Ankusa/CassandraStorage.html#M000012">drop_tables (Ankusa::CassandraStorage)</a><br />
|
42
42
|
<a href="classes/Ankusa/MemoryStorage.html#M000030">drop_tables (Ankusa::MemoryStorage)</a><br />
|
43
|
+
<a href="classes/Ankusa/FileSystemStorage.html#M000067">drop_tables (Ankusa::FileSystemStorage)</a><br />
|
43
44
|
<a href="classes/Ankusa/HBaseStorage.html#M000059">freq_table (Ankusa::HBaseStorage)</a><br />
|
44
45
|
<a href="classes/Ankusa/CassandraStorage.html#M000017">get_doc_count (Ankusa::CassandraStorage)</a><br />
|
45
46
|
<a href="classes/Ankusa/HBaseStorage.html#M000051">get_doc_count (Ankusa::HBaseStorage)</a><br />
|
46
47
|
<a href="classes/Ankusa/MemoryStorage.html#M000035">get_doc_count (Ankusa::MemoryStorage)</a><br />
|
47
48
|
<a href="classes/Ankusa/CassandraStorage.html#M000023">get_summary (Ankusa::CassandraStorage)</a><br />
|
48
49
|
<a href="classes/Ankusa/HBaseStorage.html#M000057">get_summary (Ankusa::HBaseStorage)</a><br />
|
49
|
-
<a href="classes/Ankusa/MemoryStorage.html#M000034">get_total_word_count (Ankusa::MemoryStorage)</a><br />
|
50
50
|
<a href="classes/Ankusa/CassandraStorage.html#M000016">get_total_word_count (Ankusa::CassandraStorage)</a><br />
|
51
51
|
<a href="classes/Ankusa/HBaseStorage.html#M000050">get_total_word_count (Ankusa::HBaseStorage)</a><br />
|
52
|
-
<a href="classes/Ankusa/
|
53
|
-
<a href="classes/Ankusa/CassandraStorage.html#M000015">get_vocabulary_sizes (Ankusa::CassandraStorage)</a><br />
|
52
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000034">get_total_word_count (Ankusa::MemoryStorage)</a><br />
|
54
53
|
<a href="classes/Ankusa/MemoryStorage.html#M000032">get_vocabulary_sizes (Ankusa::MemoryStorage)</a><br />
|
54
|
+
<a href="classes/Ankusa/CassandraStorage.html#M000015">get_vocabulary_sizes (Ankusa::CassandraStorage)</a><br />
|
55
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000049">get_vocabulary_sizes (Ankusa::HBaseStorage)</a><br />
|
55
56
|
<a href="classes/Ankusa/MemoryStorage.html#M000033">get_word_counts (Ankusa::MemoryStorage)</a><br />
|
56
57
|
<a href="classes/Ankusa/CassandraStorage.html#M000014">get_word_counts (Ankusa::CassandraStorage)</a><br />
|
57
58
|
<a href="classes/Ankusa/HBaseStorage.html#M000048">get_word_counts (Ankusa::HBaseStorage)</a><br />
|
58
59
|
<a href="classes/Ankusa/Classifier.html#M000006">get_word_probs (Ankusa::Classifier)</a><br />
|
59
|
-
<a href="classes/Ankusa/HBaseStorage.html#M000054">incr_doc_count (Ankusa::HBaseStorage)</a><br />
|
60
60
|
<a href="classes/Ankusa/MemoryStorage.html#M000038">incr_doc_count (Ankusa::MemoryStorage)</a><br />
|
61
61
|
<a href="classes/Ankusa/CassandraStorage.html#M000020">incr_doc_count (Ankusa::CassandraStorage)</a><br />
|
62
|
-
<a href="classes/Ankusa/
|
62
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000054">incr_doc_count (Ankusa::HBaseStorage)</a><br />
|
63
63
|
<a href="classes/Ankusa/HBaseStorage.html#M000053">incr_total_word_count (Ankusa::HBaseStorage)</a><br />
|
64
|
+
<a href="classes/Ankusa/CassandraStorage.html#M000019">incr_total_word_count (Ankusa::CassandraStorage)</a><br />
|
64
65
|
<a href="classes/Ankusa/MemoryStorage.html#M000037">incr_total_word_count (Ankusa::MemoryStorage)</a><br />
|
65
66
|
<a href="classes/Ankusa/MemoryStorage.html#M000036">incr_word_count (Ankusa::MemoryStorage)</a><br />
|
66
|
-
<a href="classes/Ankusa/CassandraStorage.html#M000018">incr_word_count (Ankusa::CassandraStorage)</a><br />
|
67
67
|
<a href="classes/Ankusa/HBaseStorage.html#M000052">incr_word_count (Ankusa::HBaseStorage)</a><br />
|
68
|
-
<a href="classes/Ankusa/CassandraStorage.html#
|
69
|
-
<a href="classes/Ankusa/HBaseStorage.html#M000047">init_tables (Ankusa::HBaseStorage)</a><br />
|
68
|
+
<a href="classes/Ankusa/CassandraStorage.html#M000018">incr_word_count (Ankusa::CassandraStorage)</a><br />
|
70
69
|
<a href="classes/Ankusa/MemoryStorage.html#M000031">init_tables (Ankusa::MemoryStorage)</a><br />
|
70
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000047">init_tables (Ankusa::HBaseStorage)</a><br />
|
71
|
+
<a href="classes/Ankusa/CassandraStorage.html#M000013">init_tables (Ankusa::CassandraStorage)</a><br />
|
72
|
+
<a href="classes/Ankusa/FileSystemStorage.html#M000068">init_tables (Ankusa::FileSystemStorage)</a><br />
|
71
73
|
<a href="classes/Ankusa/NaiveBayesClassifier.html#M000026">log_likelihoods (Ankusa::NaiveBayesClassifier)</a><br />
|
74
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000043">new (Ankusa::HBaseStorage)</a><br />
|
75
|
+
<a href="classes/Ankusa/CassandraStorage.html#M000009">new (Ankusa::CassandraStorage)</a><br />
|
72
76
|
<a href="classes/Ankusa/TextHash.html#M000060">new (Ankusa::TextHash)</a><br />
|
73
77
|
<a href="classes/Ankusa/Classifier.html#M000003">new (Ankusa::Classifier)</a><br />
|
74
|
-
<a href="classes/Ankusa/
|
78
|
+
<a href="classes/Ankusa/FileSystemStorage.html#M000065">new (Ankusa::FileSystemStorage)</a><br />
|
75
79
|
<a href="classes/Ankusa/MemoryStorage.html#M000027">new (Ankusa::MemoryStorage)</a><br />
|
76
|
-
<a href="classes/Ankusa/HBaseStorage.html#M000043">new (Ankusa::HBaseStorage)</a><br />
|
77
80
|
<a href="classes/String.html#M000001">numeric? (String)</a><br />
|
81
|
+
<a href="classes/Ankusa/FileSystemStorage.html#M000066">reset (Ankusa::FileSystemStorage)</a><br />
|
78
82
|
<a href="classes/Ankusa/HBaseStorage.html#M000045">reset (Ankusa::HBaseStorage)</a><br />
|
79
|
-
<a href="classes/Ankusa/MemoryStorage.html#M000029">reset (Ankusa::MemoryStorage)</a><br />
|
80
83
|
<a href="classes/Ankusa/CassandraStorage.html#M000011">reset (Ankusa::CassandraStorage)</a><br />
|
84
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000029">reset (Ankusa::MemoryStorage)</a><br />
|
85
|
+
<a href="classes/Ankusa/FileSystemStorage.html#M000069">save (Ankusa::FileSystemStorage)</a><br />
|
81
86
|
<a href="classes/Ankusa/HBaseStorage.html#M000058">summary_table (Ankusa::HBaseStorage)</a><br />
|
82
87
|
<a href="classes/String.html#M000002">to_ascii (String)</a><br />
|
83
88
|
<a href="classes/Ankusa/Classifier.html#M000004">train (Ankusa::Classifier)</a><br />
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'ankusa/memory_storage'
|
2
|
+
|
3
|
+
module Ankusa

  # Storage backend that keeps all counts in memory (it inherits from
  # MemoryStorage) and persists them to a single file using Marshal.
  #
  # Nothing is written to disk until #save is called explicitly.
  #
  # SECURITY NOTE: the file is read back with Marshal.load, which can
  # instantiate arbitrary objects. Only point this class at files that were
  # written by #save from a trusted source.
  class FileSystemStorage < MemoryStorage

    # file:: path of the file used to load and persist the counts. When the
    #        file does not exist yet the storage starts out empty.
    def initialize(file)
      @file = file
      init_tables
    end

    # Replaces every in-memory table with an empty one. The file on disk is
    # left untouched.
    def reset
      @freqs = {}
      @total_word_counts = Hash.new(0)
      @total_doc_counts = Hash.new(0)
      @klass_word_counts = {}
      @klass_doc_counts = {}
    end

    # Deletes the backing file (if there is one) and clears the in-memory
    # tables.
    def drop_tables
      begin
        File.delete(@file)
      rescue Errno::ENOENT
        # Nothing has been saved yet, so there is nothing to delete.
        # (The modifier form "File.delete(@file) rescue Errno::ENOENT" would
        # swallow every StandardError, not only ENOENT.)
      end
      reset
    end

    # Loads the tables from the backing file, or starts from empty tables
    # when the file does not exist.
    def init_tables
      # Binary mode: Marshal data is not text and must not go through
      # newline or encoding conversion.
      data = File.open(@file, 'rb') { |f| Marshal.load(f) }
      @freqs = data[:freqs]
      @total_word_counts = data[:total_word_counts]
      @total_doc_counts = data[:total_doc_counts]
      @klass_word_counts = data[:klass_word_counts]
      # Each table is restored from the key #save wrote it under; the doc
      # counts must not be read from :klass_word_counts.
      @klass_doc_counts = data[:klass_doc_counts]
    rescue Errno::ENOENT
      reset
    end

    # Writes all tables to disk with Marshal.
    #
    # file:: optional destination path; defaults to the path that was given
    #        to the constructor.
    def save(file = nil)
      file ||= @file
      data = {
        :freqs => @freqs,
        :total_word_counts => @total_word_counts,
        :total_doc_counts => @total_doc_counts,
        :klass_word_counts => @klass_word_counts,
        :klass_doc_counts => @klass_doc_counts
      }
      # Binary mode for the same reason as in #init_tables.
      File.open(file, 'wb') { |f| Marshal.dump(data, f) }
    end

  end

end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ankusa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 7
|
10
|
-
version: 0.0.7
|
9
|
+
- 8
|
10
|
+
version: 0.0.8
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Brian Muller
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-01-05 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -46,6 +46,7 @@ files:
|
|
46
46
|
- lib/ankusa/cassandra_storage.rb
|
47
47
|
- lib/ankusa/classifier.rb
|
48
48
|
- lib/ankusa/extensions.rb
|
49
|
+
- lib/ankusa/file_system_storage.rb
|
49
50
|
- lib/ankusa/hasher.rb
|
50
51
|
- lib/ankusa/hbase_storage.rb
|
51
52
|
- lib/ankusa/kl_divergence.rb
|
@@ -58,6 +59,7 @@ files:
|
|
58
59
|
- README.rdoc
|
59
60
|
- docs/classes/Ankusa/CassandraStorage.html
|
60
61
|
- docs/classes/Ankusa/Classifier.html
|
62
|
+
- docs/classes/Ankusa/FileSystemStorage.html
|
61
63
|
- docs/classes/Ankusa/HBaseStorage.html
|
62
64
|
- docs/classes/Ankusa/KLDivergenceClassifier.html
|
63
65
|
- docs/classes/Ankusa/MemoryStorage.html
|
@@ -69,6 +71,7 @@ files:
|
|
69
71
|
- docs/files/lib/ankusa/cassandra_storage_rb.html
|
70
72
|
- docs/files/lib/ankusa/classifier_rb.html
|
71
73
|
- docs/files/lib/ankusa/extensions_rb.html
|
74
|
+
- docs/files/lib/ankusa/file_system_storage_rb.html
|
72
75
|
- docs/files/lib/ankusa/hasher_rb.html
|
73
76
|
- docs/files/lib/ankusa/hbase_storage_rb.html
|
74
77
|
- docs/files/lib/ankusa/kl_divergence_rb.html
|