ankusa 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc ADDED
@@ -0,0 +1,30 @@
1
+ = ankusa
2
+
3
+ Ankusa is a Naive Bayes classifier in Ruby that uses Hadoop's HBase for storage. Because it uses HBase as a backend, the training corpus can be many terabytes in size.
4
+
5
+ == Installation
6
+ First, install HBase / Hadoop. Make sure the HBase Thrift interface has been started as well. Then:
7
+
8
+ gem install ankusa
9
+
10
+ == Basic Usage
11
+ require 'rubygems'
12
+ require 'ankusa'
13
+ require 'hbaserb'
14
+
15
+ # connect to HBase
16
+ client = HBaseRb::Client.new 'localhost'
17
+
18
+ c = Ankusa::Classifier.new client
19
+ c.train :spam, "This is some spammy text"
20
+ c.train :good, "This is not the bad stuff"
21
+
22
+ # This will return the most likely class (as symbol)
23
+ puts c.classify "This is some spammy text"
24
+
25
+ # This will return a Hash with classes as keys and
26
+ # membership probabilities as values
27
+ puts c.classifications "This is some spammy text"
28
+
29
+ # get a list of all classes
30
+ puts c.classnames
data/Rakefile ADDED
@@ -0,0 +1,45 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'rake/testtask'
4
+ require 'rake/rdoctask'
5
+ require 'rake/gempackagetask'
6
+
7
+ desc "Create documentation"
8
+ Rake::RDocTask.new("doc") { |rdoc|
9
+ rdoc.title = "Ankusa - Naive Bayes classifier with HBase storage"
10
+ rdoc.rdoc_dir = 'docs'
11
+ rdoc.rdoc_files.include('README.rdoc')
12
+ rdoc.rdoc_files.include('lib/**/*.rb')
13
+ }
14
+
15
+ # Run the unit tests
16
+ desc "Run all unit tests"
17
+ Rake::TestTask.new("test") { |t|
18
+ t.libs << "lib"
19
+ t.test_files = FileList['test/*_test.rb']
20
+ t.verbose = true
21
+ }
22
+
23
+ spec = Gem::Specification.new do |s|
24
+ s.name = "ankusa"
25
+ s.version = "0.0.2"
26
+ s.authors = ["Brian Muller"]
27
+ s.date = %q{2010-11-29}
28
+ s.description = "Naive Bayes classifier with HBase storage"
29
+ s.summary = "Naive Bayes classifier in Ruby that uses Hadoop's HBase for storage"
30
+ s.email = "brian.muller@livingsocial.com"
31
+ s.files = FileList["lib/**/*", "[A-Z]*", "Rakefile", "docs/**/*"]
32
+ s.homepage = "https://github.com/livingsocial/ankusa"
33
+ s.require_paths = ["lib"]
34
+ s.rubygems_version = "1.3.5"
35
+ s.add_dependency('hbaserb', '>= 0.0.1')
36
+ s.add_dependency('fast-stemmer', '>= 1.0.0')
37
+ end
38
+
39
+ Rake::GemPackageTask.new(spec) do |pkg|
40
+ pkg.need_zip = true
41
+ pkg.need_tar = true
42
+ end
43
+
44
+ desc "Default task: builds gem and runs tests"
45
+ task :default => [ :gem, :test ]
@@ -0,0 +1,149 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Module: Ankusa</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Module</strong></td>
53
+ <td class="class-name-in-header">Ankusa</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../files/lib/ankusa/classifier_rb.html">
59
+ lib/ankusa/classifier.rb
60
+ </a>
61
+ <br />
62
+ <a href="../files/lib/ankusa/hasher_rb.html">
63
+ lib/ankusa/hasher.rb
64
+ </a>
65
+ <br />
66
+ <a href="../files/lib/ankusa/nbclass_rb.html">
67
+ lib/ankusa/nbclass.rb
68
+ </a>
69
+ <br />
70
+ <a href="../files/lib/ankusa/stopwords_rb.html">
71
+ lib/ankusa/stopwords.rb
72
+ </a>
73
+ <br />
74
+ </td>
75
+ </tr>
76
+
77
+ </table>
78
+ </div>
79
+ <!-- banner header -->
80
+
81
+ <div id="bodyContent">
82
+
83
+
84
+
85
+ <div id="contextContent">
86
+
87
+
88
+
89
+ </div>
90
+
91
+
92
+ </div>
93
+
94
+
95
+ <!-- if includes -->
96
+
97
+ <div id="section">
98
+
99
+ <div id="class-list">
100
+ <h3 class="section-bar">Classes and Modules</h3>
101
+
102
+ Class <a href="Ankusa/Classifier.html" class="link">Ankusa::Classifier</a><br />
103
+ Class <a href="Ankusa/NBClass.html" class="link">Ankusa::NBClass</a><br />
104
+ Class <a href="Ankusa/TextHash.html" class="link">Ankusa::TextHash</a><br />
105
+
106
+ </div>
107
+
108
+ <div id="constants-list">
109
+ <h3 class="section-bar">Constants</h3>
110
+
111
+ <div class="name-list">
112
+ <table summary="Constants">
113
+ <tr class="top-aligned-row context-row">
114
+ <td class="context-item-name">SMALL_PROB</td>
115
+ <td>=</td>
116
+ <td class="context-item-value">0.0001</td>
117
+ </tr>
118
+ <tr class="top-aligned-row context-row">
119
+ <td class="context-item-name">STOPWORDS</td>
120
+ <td>=</td>
121
+ <td class="context-item-value">&quot;a able about above according accordingly across actually after afterwards again against ain't all allow allows almost alone along already also although always am among amongst an and another any anybody anyhow anyone anything anyway anyways anywhere apart appear appreciate appropriate are aren't around as aside ask asking associated at available away awfully be became because become becomes becoming been before beforehand behind being believe below beside besides best better between beyond both brief but by c'mon c's came can can't cannot cant cause causes certain certainly changes clearly co com come comes concerning consequently consider considering contain containing contains corresponding could couldn't course currently definitely described despite did didn't different do does doesn't doing don't done down downwards during each edu eg eight either else elsewhere enough entirely especially et etc even ever every everybody everyone everything everywhere ex exactly example except far few fifth first five followed following follows for former formerly forth four from further furthermore get gets getting given gives go goes going gone got gotten greetings had hadn't happens hardly has hasn't have haven't having he he's hello help hence her here here's hereafter hereby herein hereupon hers herself hi him himself his hither hopefully how howbeit however i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed indicate indicated indicates inner insofar instead into inward is isn't it it'd it'll it's its itself just keep keeps kept know knows known last lately later latter latterly least less lest let let's like liked likely little look looking looks ltd mainly many may maybe me mean meanwhile merely might more moreover most mostly much must my myself name namely nd near nearly necessary need needs neither never nevertheless new next nine no nobody non none noone nor normally not nothing novel now nowhere obviously of off 
often oh ok okay old on once one ones only onto or other others otherwise ought our ours ourselves out outside over overall own particular particularly per perhaps placed please plus possible presumably probably provides que quite qv rather rd re really reasonably regarding regardless regards relatively respectively right said same saw say saying says second secondly see seeing seem seemed seeming seems seen self selves sensible sent serious seriously seven several shall she should shouldn't since six so some somebody somehow someone something sometime sometimes somewhat somewhere soon sorry specified specify specifying still sub such sup sure t's take taken tell tends th than thank thanks thanx that that's thats the their theirs them themselves then thence there there's thereafter thereby therefore therein theres thereupon these they they'd they'll they're they've think third this thorough thoroughly those though three through throughout thru thus to together too took toward towards tried tries truly try trying twice two un under unfortunately unless unlikely until unto up upon us use used useful uses using usually value various very via viz vs want wants was wasn't way we we'd we'll we're we've welcome well went were weren't what what's whatever when whence whenever where where's whereafter whereas whereby wherein whereupon wherever whether which while whither who who's whoever whole whom whose why will willing wish with within without won't wonder would would wouldn't yes yet you you'd you'll you're you've your yours yourself yourselves zero&quot;.split</td>
122
+ <td width="3em">&nbsp;</td>
123
+ <td class="context-item-desc">
124
+ These are taken from MySQL - <a
125
+ href="http://dev.mysql.com/tech-resources/articles/full-text-revealed.html">dev.mysql.com/tech-resources/articles/full-text-revealed.html</a>
126
+
127
+ </td>
128
+ </tr>
129
+ </table>
130
+ </div>
131
+ </div>
132
+
133
+
134
+
135
+
136
+
137
+
138
+ <!-- if method_list -->
139
+
140
+
141
+ </div>
142
+
143
+
144
+ <div id="validator-badges">
145
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
146
+ </div>
147
+
148
+ </body>
149
+ </html>
@@ -0,0 +1,517 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Class: Ankusa::Classifier</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Class</strong></td>
53
+ <td class="class-name-in-header">Ankusa::Classifier</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../../files/lib/ankusa/classifier_rb.html">
59
+ lib/ankusa/classifier.rb
60
+ </a>
61
+ <br />
62
+ </td>
63
+ </tr>
64
+
65
+ <tr class="top-aligned-row">
66
+ <td><strong>Parent:</strong></td>
67
+ <td>
68
+ Object
69
+ </td>
70
+ </tr>
71
+ </table>
72
+ </div>
73
+ <!-- banner header -->
74
+
75
+ <div id="bodyContent">
76
+
77
+
78
+
79
+ <div id="contextContent">
80
+
81
+
82
+
83
+ </div>
84
+
85
+ <div id="method-list">
86
+ <h3 class="section-bar">Methods</h3>
87
+
88
+ <div class="name-list">
89
+ <a href="#M000005">classifications</a>&nbsp;&nbsp;
90
+ <a href="#M000004">classify</a>&nbsp;&nbsp;
91
+ <a href="#M000009">doc_count_total</a>&nbsp;&nbsp;
92
+ <a href="#M000007">drop_tables</a>&nbsp;&nbsp;
93
+ <a href="#M000013">freq_table</a>&nbsp;&nbsp;
94
+ <a href="#M000010">get_word_probs</a>&nbsp;&nbsp;
95
+ <a href="#M000011">init_tables</a>&nbsp;&nbsp;
96
+ <a href="#M000001">new</a>&nbsp;&nbsp;
97
+ <a href="#M000006">refresh_classnames</a>&nbsp;&nbsp;
98
+ <a href="#M000008">reset</a>&nbsp;&nbsp;
99
+ <a href="#M000012">summary_table</a>&nbsp;&nbsp;
100
+ <a href="#M000002">train</a>&nbsp;&nbsp;
101
+ <a href="#M000003">untrain</a>&nbsp;&nbsp;
102
+ </div>
103
+ </div>
104
+
105
+ </div>
106
+
107
+
108
+ <!-- if includes -->
109
+
110
+ <div id="section">
111
+
112
+
113
+
114
+
115
+
116
+ <div id="attribute-list">
117
+ <h3 class="section-bar">Attributes</h3>
118
+
119
+ <div class="name-list">
120
+ <table>
121
+ <tr class="top-aligned-row context-row">
122
+ <td class="context-item-name">classnames</td>
123
+ <td class="context-item-value">&nbsp;[R]&nbsp;</td>
124
+ <td class="context-item-desc"></td>
125
+ </tr>
126
+ </table>
127
+ </div>
128
+ </div>
129
+
130
+
131
+
132
+ <!-- if method_list -->
133
+ <div id="methods">
134
+ <h3 class="section-bar">Public Class methods</h3>
135
+
136
+ <div id="method-M000001" class="method-detail">
137
+ <a name="M000001"></a>
138
+
139
+ <div class="method-heading">
140
+ <a href="#M000001" class="method-signature">
141
+ <span class="method-name">new</span><span class="method-args">(hbase_client, frequency_tablename=&quot;ankusa_word_frequencies&quot;, summary_tablename=&quot;ankusa_summary&quot;)</span>
142
+ </a>
143
+ </div>
144
+
145
+ <div class="method-description">
146
+ <p><a class="source-toggle" href="#"
147
+ onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
148
+ <div class="method-source-code" id="M000001-source">
149
+ <pre>
150
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 7</span>
151
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">hbase_client</span>, <span class="ruby-identifier">frequency_tablename</span>=<span class="ruby-value str">&quot;ankusa_word_frequencies&quot;</span>, <span class="ruby-identifier">summary_tablename</span>=<span class="ruby-value str">&quot;ankusa_summary&quot;</span>)
152
+ <span class="ruby-ivar">@hbase</span> = <span class="ruby-identifier">hbase_client</span>
153
+ <span class="ruby-ivar">@ftablename</span> = <span class="ruby-identifier">frequency_tablename</span>
154
+ <span class="ruby-ivar">@stablename</span> = <span class="ruby-identifier">summary_tablename</span>
155
+ <span class="ruby-identifier">init_tables</span>
156
+ <span class="ruby-ivar">@classnames</span> = <span class="ruby-identifier">refresh_classnames</span>
157
+ <span class="ruby-keyword kw">end</span>
158
+ </pre>
159
+ </div>
160
+ </div>
161
+ </div>
162
+
163
+ <h3 class="section-bar">Public Instance methods</h3>
164
+
165
+ <div id="method-M000005" class="method-detail">
166
+ <a name="M000005"></a>
167
+
168
+ <div class="method-heading">
169
+ <a href="#M000005" class="method-signature">
170
+ <span class="method-name">classifications</span><span class="method-args">(text)</span>
171
+ </a>
172
+ </div>
173
+
174
+ <div class="method-description">
175
+ <p><a class="source-toggle" href="#"
176
+ onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
177
+ <div class="method-source-code" id="M000005-source">
178
+ <pre>
179
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 39</span>
180
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>)
181
+ <span class="ruby-identifier">classes</span> = {}
182
+ <span class="ruby-identifier">result</span> = {}
183
+ <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
184
+ <span class="ruby-identifier">classes</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">NBClass</span>.<span class="ruby-identifier">new</span> <span class="ruby-identifier">k</span>, <span class="ruby-identifier">summary_table</span>, <span class="ruby-identifier">freq_table</span>
185
+ <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-value">0</span>
186
+ }
187
+
188
+ <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>,<span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
189
+ <span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classes</span>)
190
+ <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) }
191
+ }
192
+
193
+ <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">classes</span>[<span class="ruby-identifier">k</span>].<span class="ruby-identifier">doc_count</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">doc_count_total</span>) }
194
+
195
+ <span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span>(<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>]) }
196
+ <span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
197
+ <span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">klass</span><span class="ruby-operator">|</span>
198
+ <span class="ruby-identifier">result</span>[<span class="ruby-identifier">klass</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">klass</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span>
199
+ }
200
+
201
+ <span class="ruby-identifier">result</span>
202
+ <span class="ruby-keyword kw">end</span>
203
+ </pre>
204
+ </div>
205
+ </div>
206
+ </div>
207
+
208
+ <div id="method-M000004" class="method-detail">
209
+ <a name="M000004"></a>
210
+
211
+ <div class="method-heading">
212
+ <a href="#M000004" class="method-signature">
213
+ <span class="method-name">classify</span><span class="method-args">(text)</span>
214
+ </a>
215
+ </div>
216
+
217
+ <div class="method-description">
218
+ <p><a class="source-toggle" href="#"
219
+ onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
220
+ <div class="method-source-code" id="M000004-source">
221
+ <pre>
222
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 34</span>
223
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>)
224
+ <span class="ruby-comment cmt"># return the most probable class</span>
225
+ <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">sort</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">o</span>,<span class="ruby-identifier">t</span><span class="ruby-operator">|</span> <span class="ruby-identifier">o</span>[<span class="ruby-value">1</span>] <span class="ruby-operator">&lt;=&gt;</span> <span class="ruby-identifier">t</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
226
+ <span class="ruby-keyword kw">end</span>
227
+ </pre>
228
+ </div>
229
+ </div>
230
+ </div>
231
+
232
+ <div id="method-M000009" class="method-detail">
233
+ <a name="M000009"></a>
234
+
235
+ <div class="method-heading">
236
+ <a href="#M000009" class="method-signature">
237
+ <span class="method-name">doc_count_total</span><span class="method-args">()</span>
238
+ </a>
239
+ </div>
240
+
241
+ <div class="method-description">
242
+ <p><a class="source-toggle" href="#"
243
+ onclick="toggleCode('M000009-source');return false;">[Source]</a></p>
244
+ <div class="method-source-code" id="M000009-source">
245
+ <pre>
246
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 84</span>
247
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">doc_count_total</span>
248
+ <span class="ruby-identifier">total</span> = <span class="ruby-value">0</span>
249
+ <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">create_scanner</span>(<span class="ruby-value str">&quot;&quot;</span>, <span class="ruby-value str">&quot;totals:doccount&quot;</span>) { <span class="ruby-operator">|</span><span class="ruby-identifier">row</span><span class="ruby-operator">|</span>
250
+ <span class="ruby-identifier">total</span> <span class="ruby-operator">+=</span> <span class="ruby-identifier">row</span>.<span class="ruby-identifier">columns</span>[<span class="ruby-value str">&quot;totals:doccount&quot;</span>].<span class="ruby-identifier">to_i64</span>
251
+ }
252
+ <span class="ruby-identifier">total</span>
253
+ <span class="ruby-keyword kw">end</span>
254
+ </pre>
255
+ </div>
256
+ </div>
257
+ </div>
258
+
259
+ <div id="method-M000007" class="method-detail">
260
+ <a name="M000007"></a>
261
+
262
+ <div class="method-heading">
263
+ <a href="#M000007" class="method-signature">
264
+ <span class="method-name">drop_tables</span><span class="method-args">()</span>
265
+ </a>
266
+ </div>
267
+
268
+ <div class="method-description">
269
+ <p><a class="source-toggle" href="#"
270
+ onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
271
+ <div class="method-source-code" id="M000007-source">
272
+ <pre>
273
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 72</span>
274
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">drop_tables</span>
275
+ <span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">delete</span>
276
+ <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">delete</span>
277
+ <span class="ruby-ivar">@stable</span> = <span class="ruby-keyword kw">nil</span>
278
+ <span class="ruby-ivar">@ftable</span> = <span class="ruby-keyword kw">nil</span>
279
+ <span class="ruby-keyword kw">end</span>
280
+ </pre>
281
+ </div>
282
+ </div>
283
+ </div>
284
+
285
+ <div id="method-M000006" class="method-detail">
286
+ <a name="M000006"></a>
287
+
288
+ <div class="method-heading">
289
+ <a href="#M000006" class="method-signature">
290
+ <span class="method-name">refresh_classnames</span><span class="method-args">()</span>
291
+ </a>
292
+ </div>
293
+
294
+ <div class="method-description">
295
+ <p>
296
+ get all classes
297
+ </p>
298
+ <p><a class="source-toggle" href="#"
299
+ onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
300
+ <div class="method-source-code" id="M000006-source">
301
+ <pre>
302
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 64</span>
303
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">refresh_classnames</span>
304
+ <span class="ruby-identifier">cs</span> = []
305
+ <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">create_scanner</span>(<span class="ruby-value str">&quot;&quot;</span>, <span class="ruby-value str">&quot;totals&quot;</span>) { <span class="ruby-operator">|</span><span class="ruby-identifier">row</span><span class="ruby-operator">|</span>
306
+ <span class="ruby-identifier">cs</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">row</span>.<span class="ruby-identifier">row</span>.<span class="ruby-identifier">intern</span>
307
+ }
308
+ <span class="ruby-identifier">cs</span>
309
+ <span class="ruby-keyword kw">end</span>
310
+ </pre>
311
+ </div>
312
+ </div>
313
+ </div>
314
+
315
+ <div id="method-M000008" class="method-detail">
316
+ <a name="M000008"></a>
317
+
318
+ <div class="method-heading">
319
+ <a href="#M000008" class="method-signature">
320
+ <span class="method-name">reset</span><span class="method-args">()</span>
321
+ </a>
322
+ </div>
323
+
324
+ <div class="method-description">
325
+ <p><a class="source-toggle" href="#"
326
+ onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
327
+ <div class="method-source-code" id="M000008-source">
328
+ <pre>
329
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 79</span>
330
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">reset</span>
331
+ <span class="ruby-identifier">drop_tables</span>
332
+ <span class="ruby-identifier">init_tables</span>
333
+ <span class="ruby-keyword kw">end</span>
334
+ </pre>
335
+ </div>
336
+ </div>
337
+ </div>
338
+
339
+ <div id="method-M000002" class="method-detail">
340
+ <a name="M000002"></a>
341
+
342
+ <div class="method-heading">
343
+ <a href="#M000002" class="method-signature">
344
+ <span class="method-name">train</span><span class="method-args">(klass, text)</span>
345
+ </a>
346
+ </div>
347
+
348
+ <div class="method-description">
349
+ <p><a class="source-toggle" href="#"
350
+ onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
351
+ <div class="method-source-code" id="M000002-source">
352
+ <pre>
353
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 15</span>
354
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">train</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">text</span>)
355
+ <span class="ruby-identifier">th</span> = <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>)
356
+ <span class="ruby-identifier">th</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
357
+ <span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">word</span>, <span class="ruby-node">&quot;classes:#{klass.to_s}&quot;</span>, <span class="ruby-identifier">count</span>
358
+ }
359
+ <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">&quot;totals:wordcount&quot;</span>, <span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
360
+ <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">&quot;totals:doccount&quot;</span>
361
+ <span class="ruby-ivar">@classnames</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">klass</span> <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">klass</span>
362
+ <span class="ruby-keyword kw">end</span>
363
+ </pre>
364
+ </div>
365
+ </div>
366
+ </div>
367
+
368
+ <div id="method-M000003" class="method-detail">
369
+ <a name="M000003"></a>
370
+
371
+ <div class="method-heading">
372
+ <a href="#M000003" class="method-signature">
373
+ <span class="method-name">untrain</span><span class="method-args">(klass, text)</span>
374
+ </a>
375
+ </div>
376
+
377
+ <div class="method-description">
378
+ <p><a class="source-toggle" href="#"
379
+ onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
380
+ <div class="method-source-code" id="M000003-source">
381
+ <pre>
382
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 25</span>
383
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">untrain</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">text</span>)
384
+ <span class="ruby-identifier">th</span> = <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>)
385
+ <span class="ruby-identifier">th</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
386
+ <span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">word</span>, <span class="ruby-node">&quot;classes:#{klass.to_s}&quot;</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">count</span>
387
+ }
388
+ <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">&quot;totals:wordcount&quot;</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
389
+ <span class="ruby-identifier">summary_table</span>.<span class="ruby-identifier">atomic_increment</span> <span class="ruby-identifier">klass</span>, <span class="ruby-value str">&quot;totals:doccount&quot;</span>, <span class="ruby-value">-1</span>
390
+ <span class="ruby-keyword kw">end</span>
391
+ </pre>
392
+ </div>
393
+ </div>
394
+ </div>
395
+
396
+ <h3 class="section-bar">Protected Instance methods</h3>
397
+
398
+ <div id="method-M000013" class="method-detail">
399
+ <a name="M000013"></a>
400
+
401
+ <div class="method-heading">
402
+ <a href="#M000013" class="method-signature">
403
+ <span class="method-name">freq_table</span><span class="method-args">()</span>
404
+ </a>
405
+ </div>
406
+
407
+ <div class="method-description">
408
+ <p><a class="source-toggle" href="#"
409
+ onclick="toggleCode('M000013-source');return false;">[Source]</a></p>
410
+ <div class="method-source-code" id="M000013-source">
411
+ <pre>
412
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 120</span>
413
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">freq_table</span>
414
+ <span class="ruby-ivar">@ftable</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">get_table</span> <span class="ruby-ivar">@ftablename</span>
415
+ <span class="ruby-keyword kw">end</span>
416
+ </pre>
417
+ </div>
418
+ </div>
419
+ </div>
420
+
421
+ <div id="method-M000010" class="method-detail">
422
+ <a name="M000010"></a>
423
+
424
+ <div class="method-heading">
425
+ <a href="#M000010" class="method-signature">
426
+ <span class="method-name">get_word_probs</span><span class="method-args">(word, classes)</span>
427
+ </a>
428
+ </div>
429
+
430
+ <div class="method-description">
431
+ <p><a class="source-toggle" href="#"
432
+ onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
433
+ <div class="method-source-code" id="M000010-source">
434
+ <pre>
435
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 93</span>
436
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classes</span>)
437
+ <span class="ruby-identifier">probs</span> = {}
438
+ <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cn</span><span class="ruby-operator">|</span> <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] = <span class="ruby-constant">Ankusa</span><span class="ruby-operator">::</span><span class="ruby-constant">SMALL_PROB</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">classes</span>[<span class="ruby-identifier">cn</span>].<span class="ruby-identifier">word_count</span> }
439
+ <span class="ruby-identifier">row</span> = <span class="ruby-identifier">freq_table</span>.<span class="ruby-identifier">get_row</span>(<span class="ruby-identifier">word</span>)
440
+ <span class="ruby-keyword kw">return</span> <span class="ruby-identifier">probs</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">row</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">==</span> <span class="ruby-value">0</span>
441
+
442
+ <span class="ruby-identifier">row</span>.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">columns</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">colname</span>, <span class="ruby-identifier">cell</span><span class="ruby-operator">|</span>
443
+ <span class="ruby-identifier">classname</span> = <span class="ruby-identifier">colname</span>.<span class="ruby-identifier">split</span>(<span class="ruby-value str">':'</span>)[<span class="ruby-value">1</span>].<span class="ruby-identifier">intern</span>
444
+ <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">classname</span>] = <span class="ruby-identifier">cell</span>.<span class="ruby-identifier">to_i64</span>.<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">classes</span>[<span class="ruby-identifier">classname</span>].<span class="ruby-identifier">word_count</span>
445
+ }
446
+ <span class="ruby-identifier">probs</span>
447
+ <span class="ruby-keyword kw">end</span>
448
+ </pre>
449
+ </div>
450
+ </div>
451
+ </div>
452
+
453
+ <div id="method-M000011" class="method-detail">
454
+ <a name="M000011"></a>
455
+
456
+ <div class="method-heading">
457
+ <a href="#M000011" class="method-signature">
458
+ <span class="method-name">init_tables</span><span class="method-args">()</span>
459
+ </a>
460
+ </div>
461
+
462
+ <div class="method-description">
463
+ <p><a class="source-toggle" href="#"
464
+ onclick="toggleCode('M000011-source');return false;">[Source]</a></p>
465
+ <div class="method-source-code" id="M000011-source">
466
+ <pre>
467
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 106</span>
468
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">init_tables</span>
469
+ <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">has_table?</span> <span class="ruby-ivar">@ftablename</span>
470
+ <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">create_table</span> <span class="ruby-ivar">@ftablename</span>, <span class="ruby-value str">&quot;classes&quot;</span>, <span class="ruby-value str">&quot;total&quot;</span>
471
+ <span class="ruby-keyword kw">end</span>
472
+
473
+ <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">has_table?</span> <span class="ruby-ivar">@stablename</span>
474
+ <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">create_table</span> <span class="ruby-ivar">@stablename</span>, <span class="ruby-value str">&quot;totals&quot;</span>
475
+ <span class="ruby-keyword kw">end</span>
476
+ <span class="ruby-keyword kw">end</span>
477
+ </pre>
478
+ </div>
479
+ </div>
480
+ </div>
481
+
482
+ <div id="method-M000012" class="method-detail">
483
+ <a name="M000012"></a>
484
+
485
+ <div class="method-heading">
486
+ <a href="#M000012" class="method-signature">
487
+ <span class="method-name">summary_table</span><span class="method-args">()</span>
488
+ </a>
489
+ </div>
490
+
491
+ <div class="method-description">
492
+ <p><a class="source-toggle" href="#"
493
+ onclick="toggleCode('M000012-source');return false;">[Source]</a></p>
494
+ <div class="method-source-code" id="M000012-source">
495
+ <pre>
496
+ <span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 116</span>
497
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">summary_table</span>
498
+ <span class="ruby-ivar">@stable</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@hbase</span>.<span class="ruby-identifier">get_table</span> <span class="ruby-ivar">@stablename</span>
499
+ <span class="ruby-keyword kw">end</span>
500
+ </pre>
501
+ </div>
502
+ </div>
503
+ </div>
504
+
505
+
506
+ </div>
507
+
508
+
509
+ </div>
510
+
511
+
512
+ <div id="validator-badges">
513
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
514
+ </div>
515
+
516
+ </body>
517
+ </html>