ankusa 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,30 @@
1
+
2
+ <?xml version="1.0" encoding="iso-8859-1"?>
3
+ <!DOCTYPE html
4
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
5
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
6
+
7
+ <!--
8
+
9
+ Classes
10
+
11
+ -->
12
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
13
+ <head>
14
+ <title>Classes</title>
15
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
16
+ <link rel="stylesheet" href="rdoc-style.css" type="text/css" />
17
+ <base target="docwin" />
18
+ </head>
19
+ <body>
20
+ <div id="index">
21
+ <h1 class="section-bar">Classes</h1>
22
+ <div id="index-entries">
23
+ <a href="classes/Ankusa.html">Ankusa</a><br />
24
+ <a href="classes/Ankusa/Classifier.html">Ankusa::Classifier</a><br />
25
+ <a href="classes/Ankusa/NBClass.html">Ankusa::NBClass</a><br />
26
+ <a href="classes/Ankusa/TextHash.html">Ankusa::TextHash</a><br />
27
+ </div>
28
+ </div>
29
+ </body>
30
+ </html>
@@ -0,0 +1,32 @@
1
+
2
+ <?xml version="1.0" encoding="iso-8859-1"?>
3
+ <!DOCTYPE html
4
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
5
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
6
+
7
+ <!--
8
+
9
+ Files
10
+
11
+ -->
12
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
13
+ <head>
14
+ <title>Files</title>
15
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
16
+ <link rel="stylesheet" href="rdoc-style.css" type="text/css" />
17
+ <base target="docwin" />
18
+ </head>
19
+ <body>
20
+ <div id="index">
21
+ <h1 class="section-bar">Files</h1>
22
+ <div id="index-entries">
23
+ <a href="files/README_rdoc.html">README.rdoc</a><br />
24
+ <a href="files/lib/ankusa_rb.html">lib/ankusa.rb</a><br />
25
+ <a href="files/lib/ankusa/classifier_rb.html">lib/ankusa/classifier.rb</a><br />
26
+ <a href="files/lib/ankusa/hasher_rb.html">lib/ankusa/hasher.rb</a><br />
27
+ <a href="files/lib/ankusa/nbclass_rb.html">lib/ankusa/nbclass.rb</a><br />
28
+ <a href="files/lib/ankusa/stopwords_rb.html">lib/ankusa/stopwords.rb</a><br />
29
+ </div>
30
+ </div>
31
+ </body>
32
+ </html>
@@ -0,0 +1,43 @@
1
+
2
+ <?xml version="1.0" encoding="iso-8859-1"?>
3
+ <!DOCTYPE html
4
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
5
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
6
+
7
+ <!--
8
+
9
+ Methods
10
+
11
+ -->
12
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
13
+ <head>
14
+ <title>Methods</title>
15
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
16
+ <link rel="stylesheet" href="rdoc-style.css" type="text/css" />
17
+ <base target="docwin" />
18
+ </head>
19
+ <body>
20
+ <div id="index">
21
+ <h1 class="section-bar">Methods</h1>
22
+ <div id="index-entries">
23
+ <a href="classes/Ankusa/TextHash.html#M000016">add_text (Ankusa::TextHash)</a><br />
24
+ <a href="classes/Ankusa/TextHash.html#M000017">add_word (Ankusa::TextHash)</a><br />
25
+ <a href="classes/Ankusa/Classifier.html#M000005">classifications (Ankusa::Classifier)</a><br />
26
+ <a href="classes/Ankusa/Classifier.html#M000004">classify (Ankusa::Classifier)</a><br />
27
+ <a href="classes/Ankusa/Classifier.html#M000009">doc_count_total (Ankusa::Classifier)</a><br />
28
+ <a href="classes/Ankusa/Classifier.html#M000007">drop_tables (Ankusa::Classifier)</a><br />
29
+ <a href="classes/Ankusa/Classifier.html#M000013">freq_table (Ankusa::Classifier)</a><br />
30
+ <a href="classes/Ankusa/Classifier.html#M000010">get_word_probs (Ankusa::Classifier)</a><br />
31
+ <a href="classes/Ankusa/Classifier.html#M000011">init_tables (Ankusa::Classifier)</a><br />
32
+ <a href="classes/Ankusa/NBClass.html#M000014">new (Ankusa::NBClass)</a><br />
33
+ <a href="classes/Ankusa/TextHash.html#M000015">new (Ankusa::TextHash)</a><br />
34
+ <a href="classes/Ankusa/Classifier.html#M000001">new (Ankusa::Classifier)</a><br />
35
+ <a href="classes/Ankusa/Classifier.html#M000006">refresh_classnames (Ankusa::Classifier)</a><br />
36
+ <a href="classes/Ankusa/Classifier.html#M000008">reset (Ankusa::Classifier)</a><br />
37
+ <a href="classes/Ankusa/Classifier.html#M000012">summary_table (Ankusa::Classifier)</a><br />
38
+ <a href="classes/Ankusa/Classifier.html#M000002">train (Ankusa::Classifier)</a><br />
39
+ <a href="classes/Ankusa/Classifier.html#M000003">untrain (Ankusa::Classifier)</a><br />
40
+ </div>
41
+ </div>
42
+ </body>
43
+ </html>
data/docs/index.html ADDED
@@ -0,0 +1,24 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">
5
+
6
+ <!--
7
+
8
+ HBaseRb - Naive Bayes classifier with HBase storage
9
+
10
+ -->
11
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
12
+ <head>
13
+ <title>HBaseRb - Naive Bayes classifier with HBase storage</title>
14
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
15
+ </head>
16
+ <frameset rows="20%, 80%">
17
+ <frameset cols="25%,35%,45%">
18
+ <frame src="fr_file_index.html" title="Files" name="Files" />
19
+ <frame src="fr_class_index.html" name="Classes" />
20
+ <frame src="fr_method_index.html" name="Methods" />
21
+ </frameset>
22
+ <frame src="files/README_rdoc.html" name="docwin" />
23
+ </frameset>
24
+ </html>
@@ -0,0 +1,208 @@
1
+
2
+ body {
3
+ font-family: Verdana,Arial,Helvetica,sans-serif;
4
+ font-size: 90%;
5
+ margin: 0;
6
+ margin-left: 40px;
7
+ padding: 0;
8
+ background: white;
9
+ }
10
+
11
+ h1,h2,h3,h4 { margin: 0; color: #efefef; background: transparent; }
12
+ h1 { font-size: 150%; }
13
+ h2,h3,h4 { margin-top: 1em; }
14
+
15
+ a { background: #eef; color: #039; text-decoration: none; }
16
+ a:hover { background: #039; color: #eef; }
17
+
18
+ /* Override the base stylesheet's Anchor inside a table cell */
19
+ td > a {
20
+ background: transparent;
21
+ color: #039;
22
+ text-decoration: none;
23
+ }
24
+
25
+ /* and inside a section title */
26
+ .section-title > a {
27
+ background: transparent;
28
+ color: #eee;
29
+ text-decoration: none;
30
+ }
31
+
32
+ /* === Structural elements =================================== */
33
+
34
+ div#index {
35
+ margin: 0;
36
+ margin-left: -40px;
37
+ padding: 0;
38
+ font-size: 90%;
39
+ }
40
+
41
+
42
+ div#index a {
43
+ margin-left: 0.7em;
44
+ }
45
+
46
+ div#index .section-bar {
47
+ margin-left: 0px;
48
+ padding-left: 0.7em;
49
+ background: #ccc;
50
+ font-size: small;
51
+ }
52
+
53
+
54
+ div#classHeader, div#fileHeader {
55
+ width: auto;
56
+ color: white;
57
+ padding: 0.5em 1.5em 0.5em 1.5em;
58
+ margin: 0;
59
+ margin-left: -40px;
60
+ border-bottom: 3px solid #006;
61
+ }
62
+
63
+ div#classHeader a, div#fileHeader a {
64
+ background: inherit;
65
+ color: white;
66
+ }
67
+
68
+ div#classHeader td, div#fileHeader td {
69
+ background: inherit;
70
+ color: white;
71
+ }
72
+
73
+
74
+ div#fileHeader {
75
+ background: #057;
76
+ }
77
+
78
+ div#classHeader {
79
+ background: #048;
80
+ }
81
+
82
+
83
+ .class-name-in-header {
84
+ font-size: 180%;
85
+ font-weight: bold;
86
+ }
87
+
88
+
89
+ div#bodyContent {
90
+ padding: 0 1.5em 0 1.5em;
91
+ }
92
+
93
+ div#description {
94
+ padding: 0.5em 1.5em;
95
+ background: #efefef;
96
+ border: 1px dotted #999;
97
+ }
98
+
99
+ div#description h1,h2,h3,h4,h5,h6 {
100
+ color: #125;
101
+ background: transparent;
102
+ }
103
+
104
+ div#validator-badges {
105
+ text-align: center;
106
+ }
107
+ div#validator-badges img { border: 0; }
108
+
109
+ div#copyright {
110
+ color: #333;
111
+ background: #efefef;
112
+ font: 0.75em sans-serif;
113
+ margin-top: 5em;
114
+ margin-bottom: 0;
115
+ padding: 0.5em 2em;
116
+ }
117
+
118
+
119
+ /* === Classes =================================== */
120
+
121
+ table.header-table {
122
+ color: white;
123
+ font-size: small;
124
+ }
125
+
126
+ .type-note {
127
+ font-size: small;
128
+ color: #DEDEDE;
129
+ }
130
+
131
+ .xxsection-bar {
132
+ background: #eee;
133
+ color: #333;
134
+ padding: 3px;
135
+ }
136
+
137
+ .section-bar {
138
+ color: #333;
139
+ border-bottom: 1px solid #999;
140
+ margin-left: -20px;
141
+ }
142
+
143
+
144
+ .section-title {
145
+ background: #79a;
146
+ color: #eee;
147
+ padding: 3px;
148
+ margin-top: 2em;
149
+ margin-left: -30px;
150
+ border: 1px solid #999;
151
+ }
152
+
153
+ .top-aligned-row { vertical-align: top }
154
+ .bottom-aligned-row { vertical-align: bottom }
155
+
156
+ /* --- Context section classes ----------------------- */
157
+
158
+ .context-row { }
159
+ .context-item-name { font-family: monospace; font-weight: bold; color: black; }
160
+ .context-item-value { font-size: small; color: #448; }
161
+ .context-item-desc { color: #333; padding-left: 2em; }
162
+
163
+ /* --- Method classes -------------------------- */
164
+ .method-detail {
165
+ background: #efefef;
166
+ padding: 0;
167
+ margin-top: 0.5em;
168
+ margin-bottom: 1em;
169
+ border: 1px dotted #ccc;
170
+ }
171
+ .method-heading {
172
+ color: black;
173
+ background: #ccc;
174
+ border-bottom: 1px solid #666;
175
+ padding: 0.2em 0.5em 0 0.5em;
176
+ }
177
+ .method-signature { color: black; background: inherit; }
178
+ .method-name { font-weight: bold; }
179
+ .method-args { font-style: italic; }
180
+ .method-description { padding: 0 0.5em 0 0.5em; }
181
+
182
+ /* --- Source code sections -------------------- */
183
+
184
+ a.source-toggle { font-size: 90%; }
185
+ div.method-source-code {
186
+ background: #262626;
187
+ color: #ffdead;
188
+ margin: 1em;
189
+ padding: 0.5em;
190
+ border: 1px dashed #999;
191
+ overflow: hidden;
192
+ }
193
+
194
+ div.method-source-code pre { color: #ffdead; overflow: hidden; }
195
+
196
+ /* --- Ruby keyword styles --------------------- */
197
+
198
+ .standalone-code { background: #221111; color: #ffdead; overflow: hidden; }
199
+
200
+ .ruby-constant { color: #7fffd4; background: transparent; }
201
+ .ruby-keyword { color: #00ffff; background: transparent; }
202
+ .ruby-ivar { color: #eedd82; background: transparent; }
203
+ .ruby-operator { color: #00ffee; background: transparent; }
204
+ .ruby-identifier { color: #ffdead; background: transparent; }
205
+ .ruby-node { color: #ffa07a; background: transparent; }
206
+ .ruby-comment { color: #b22222; font-weight: bold; background: transparent; }
207
+ .ruby-regexp { color: #ffa07a; background: transparent; }
208
+ .ruby-value { color: #7fffd4; background: transparent; }
data/lib/ankusa.rb ADDED
@@ -0,0 +1,3 @@
1
+ require 'ankusa/classifier'
2
+ require 'ankusa/hasher'
3
+ require 'ankusa/nbclass'
@@ -0,0 +1,125 @@
1
module Ankusa
  # Numerator of the fallback probability given to a word that has no positive
  # recorded count for a class (see Classifier#get_word_probs).
  SMALL_PROB = 0.0001

  # Naive Bayes text classifier whose counts live in two tables reached through
  # the client object handed to #initialize:
  #
  # * frequency table: one row per word, one "classes:<name>" column per class
  # * summary table:   one row per class, with "totals:wordcount" and
  #   "totals:doccount" columns
  #
  # The client must respond to has_table?, create_table and get_table; the
  # tables must respond to atomic_increment, get_row, create_scanner and delete.
  class Classifier
    # Class names known to this instance, as symbols.
    attr_reader :classnames

    # hbase_client::        client used to create and fetch the two tables
    # frequency_tablename:: name of the per-word frequency table
    # summary_tablename::   name of the per-class summary table
    #
    # Creates whichever table is missing and loads the stored class names.
    def initialize(hbase_client, frequency_tablename="ankusa_word_frequencies", summary_tablename="ankusa_summary")
      @hbase = hbase_client
      @ftablename = frequency_tablename
      @stablename = summary_tablename
      init_tables
      @classnames = refresh_classnames
    end

    # Adds the words of +text+ to the counts of class +klass+ and registers
    # +klass+ as a known class.
    def train(klass, text)
      th = TextHash.new(text)
      th.each do |word, count|
        freq_table.atomic_increment word, "classes:#{klass}", count
      end
      summary_table.atomic_increment klass, "totals:wordcount", th.word_count
      summary_table.atomic_increment klass, "totals:doccount"

      # refresh_classnames and get_word_probs both work with interned names, so
      # store the name as a symbol even when the caller passed a string.
      name = klass.to_s.intern
      @classnames << name unless @classnames.include? name
    end

    # Reverses a previous #train call for the same +klass+ and +text+ by
    # decrementing the same counters. The class stays in #classnames.
    def untrain(klass, text)
      th = TextHash.new(text)
      th.each do |word, count|
        freq_table.atomic_increment word, "classes:#{klass}", -count
      end
      summary_table.atomic_increment klass, "totals:wordcount", -th.word_count
      summary_table.atomic_increment klass, "totals:doccount", -1
    end

    # Returns the name of the most probable class for +text+, or nil when no
    # class is known.
    def classify(text)
      best = classifications(text).max_by { |_name, prob| prob }
      best && best.first
    end

    # Returns a hash mapping every known class name to its normalized
    # probability for +text+ (the values sum to 1). Returns an empty hash when
    # no class is known.
    def classifications(text)
      return {} if @classnames.empty?

      classes = {}
      log_scores = {}
      @classnames.each do |name|
        classes[name] = NBClass.new name, summary_table, freq_table
        log_scores[name] = 0.0
      end

      # NOTE(review): each distinct word contributes once; the per-word count
      # in the text is not used as a weight -- confirm this is intended.
      TextHash.new(text).each do |word, _count|
        probs = get_word_probs(word, classes)
        @classnames.each { |name| log_scores[name] += Math.log(probs[name]) }
      end

      # Scan the summary table once, and force float division so the prior is
      # not truncated to zero when the counts are integers.
      total_docs = doc_count_total.to_f
      @classnames.each do |name|
        prior = total_docs > 0 ? classes[name].doc_count / total_docs : 0.0
        log_scores[name] += log_or_neg_infinity(prior)
      end

      normalize_log_scores(log_scores)
    end

    # Returns the class names stored in the summary table, as symbols.
    def refresh_classnames
      cs = []
      summary_table.create_scanner("", "totals") do |row|
        cs << row.row.intern
      end
      cs
    end

    # Deletes both tables and forgets the cached table handles.
    def drop_tables
      freq_table.delete
      summary_table.delete
      @stable = nil
      @ftable = nil
    end

    # Drops and re-creates both tables, then reloads the class names so that
    # names from the dropped tables are not kept.
    def reset
      drop_tables
      init_tables
      @classnames = refresh_classnames
    end

    # Returns the sum of "totals:doccount" over all rows of the summary table.
    def doc_count_total
      total = 0
      summary_table.create_scanner("", "totals:doccount") do |row|
        total += row.columns["totals:doccount"].to_i64
      end
      total
    end

    protected

    # Returns a hash mapping every known class name to the probability of
    # +word+ in that class. +classes+ maps class names to objects responding to
    # word_count. Every returned value is strictly positive, so callers can
    # take its logarithm.
    def get_word_probs(word, classes)
      probs = {}
      @classnames.each do |cn|
        wc = classes[cn].word_count
        # A class without a positive word count would otherwise divide by zero.
        probs[cn] = wc > 0 ? Ankusa::SMALL_PROB / wc : Ankusa::SMALL_PROB
      end

      row = freq_table.get_row(word)
      return probs if row.empty?

      row.first.columns.each do |colname, cell|
        family, name = colname.split(':', 2)
        next unless family == "classes" && name && !name.empty?

        nbclass = classes[name.intern]
        next unless nbclass # column of a class this instance does not know

        count = cell.to_i64
        wc = nbclass.word_count
        # Counts that untrain drove to zero or below keep the fallback value.
        probs[name.intern] = count.to_f / wc if count > 0 && wc > 0
      end
      probs
    end

    # Natural logarithm of +value+, or negative infinity when +value+ is not
    # strictly positive (including NaN).
    def log_or_neg_infinity(value)
      value > 0 ? Math.log(value) : -Float::INFINITY
    end

    # Turns per-class log scores into probabilities that sum to 1. The largest
    # finite score is subtracted before exponentiating so that long texts do
    # not underflow every class to zero. When no class has a finite score the
    # classes are indistinguishable and each gets an equal share.
    def normalize_log_scores(log_scores)
      result = {}
      finite = log_scores.values.select { |score| score.finite? }

      if finite.empty?
        share = 1.0 / log_scores.size
        log_scores.each_key { |name| result[name] = share }
        return result
      end

      peak = finite.max
      weights = {}
      log_scores.each { |name, score| weights[name] = Math.exp(score - peak) }
      total = weights.values.inject { |a, b| a + b }
      weights.each { |name, weight| result[name] = weight / total }
      result
    end

    # Creates the frequency and summary tables unless they already exist.
    def init_tables
      unless @hbase.has_table? @ftablename
        @hbase.create_table @ftablename, "classes", "total"
      end

      unless @hbase.has_table? @stablename
        @hbase.create_table @stablename, "totals"
      end
    end

    # Cached handle of the summary table.
    def summary_table
      @stable ||= @hbase.get_table @stablename
    end

    # Cached handle of the frequency table.
    def freq_table
      @ftable ||= @hbase.get_table @ftablename
    end
  end

end