ankusa 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,30 @@
1
+
2
+ <?xml version="1.0" encoding="iso-8859-1"?>
3
+ <!DOCTYPE html
4
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
5
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
6
+
7
+ <!--
8
+
9
+ Classes
10
+
11
+ -->
12
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
13
+ <head>
14
+ <title>Classes</title>
15
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
16
+ <link rel="stylesheet" href="rdoc-style.css" type="text/css" />
17
+ <base target="docwin" />
18
+ </head>
19
+ <body>
20
+ <div id="index">
21
+ <h1 class="section-bar">Classes</h1>
22
+ <div id="index-entries">
23
+ <a href="classes/Ankusa.html">Ankusa</a><br />
24
+ <a href="classes/Ankusa/Classifier.html">Ankusa::Classifier</a><br />
25
+ <a href="classes/Ankusa/NBClass.html">Ankusa::NBClass</a><br />
26
+ <a href="classes/Ankusa/TextHash.html">Ankusa::TextHash</a><br />
27
+ </div>
28
+ </div>
29
+ </body>
30
+ </html>
@@ -0,0 +1,32 @@
1
+
2
+ <?xml version="1.0" encoding="iso-8859-1"?>
3
+ <!DOCTYPE html
4
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
5
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
6
+
7
+ <!--
8
+
9
+ Files
10
+
11
+ -->
12
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
13
+ <head>
14
+ <title>Files</title>
15
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
16
+ <link rel="stylesheet" href="rdoc-style.css" type="text/css" />
17
+ <base target="docwin" />
18
+ </head>
19
+ <body>
20
+ <div id="index">
21
+ <h1 class="section-bar">Files</h1>
22
+ <div id="index-entries">
23
+ <a href="files/README_rdoc.html">README.rdoc</a><br />
24
+ <a href="files/lib/ankusa_rb.html">lib/ankusa.rb</a><br />
25
+ <a href="files/lib/ankusa/classifier_rb.html">lib/ankusa/classifier.rb</a><br />
26
+ <a href="files/lib/ankusa/hasher_rb.html">lib/ankusa/hasher.rb</a><br />
27
+ <a href="files/lib/ankusa/nbclass_rb.html">lib/ankusa/nbclass.rb</a><br />
28
+ <a href="files/lib/ankusa/stopwords_rb.html">lib/ankusa/stopwords.rb</a><br />
29
+ </div>
30
+ </div>
31
+ </body>
32
+ </html>
@@ -0,0 +1,43 @@
1
+
2
+ <?xml version="1.0" encoding="iso-8859-1"?>
3
+ <!DOCTYPE html
4
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
5
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
6
+
7
+ <!--
8
+
9
+ Methods
10
+
11
+ -->
12
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
13
+ <head>
14
+ <title>Methods</title>
15
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
16
+ <link rel="stylesheet" href="rdoc-style.css" type="text/css" />
17
+ <base target="docwin" />
18
+ </head>
19
+ <body>
20
+ <div id="index">
21
+ <h1 class="section-bar">Methods</h1>
22
+ <div id="index-entries">
23
+ <a href="classes/Ankusa/TextHash.html#M000016">add_text (Ankusa::TextHash)</a><br />
24
+ <a href="classes/Ankusa/TextHash.html#M000017">add_word (Ankusa::TextHash)</a><br />
25
+ <a href="classes/Ankusa/Classifier.html#M000005">classifications (Ankusa::Classifier)</a><br />
26
+ <a href="classes/Ankusa/Classifier.html#M000004">classify (Ankusa::Classifier)</a><br />
27
+ <a href="classes/Ankusa/Classifier.html#M000009">doc_count_total (Ankusa::Classifier)</a><br />
28
+ <a href="classes/Ankusa/Classifier.html#M000007">drop_tables (Ankusa::Classifier)</a><br />
29
+ <a href="classes/Ankusa/Classifier.html#M000013">freq_table (Ankusa::Classifier)</a><br />
30
+ <a href="classes/Ankusa/Classifier.html#M000010">get_word_probs (Ankusa::Classifier)</a><br />
31
+ <a href="classes/Ankusa/Classifier.html#M000011">init_tables (Ankusa::Classifier)</a><br />
32
+ <a href="classes/Ankusa/NBClass.html#M000014">new (Ankusa::NBClass)</a><br />
33
+ <a href="classes/Ankusa/TextHash.html#M000015">new (Ankusa::TextHash)</a><br />
34
+ <a href="classes/Ankusa/Classifier.html#M000001">new (Ankusa::Classifier)</a><br />
35
+ <a href="classes/Ankusa/Classifier.html#M000006">refresh_classnames (Ankusa::Classifier)</a><br />
36
+ <a href="classes/Ankusa/Classifier.html#M000008">reset (Ankusa::Classifier)</a><br />
37
+ <a href="classes/Ankusa/Classifier.html#M000012">summary_table (Ankusa::Classifier)</a><br />
38
+ <a href="classes/Ankusa/Classifier.html#M000002">train (Ankusa::Classifier)</a><br />
39
+ <a href="classes/Ankusa/Classifier.html#M000003">untrain (Ankusa::Classifier)</a><br />
40
+ </div>
41
+ </div>
42
+ </body>
43
+ </html>
data/docs/index.html ADDED
@@ -0,0 +1,24 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">
5
+
6
+ <!--
7
+
8
+ Ankusa - Naive Bayes classifier with HBase storage
9
+
10
+ -->
11
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
12
+ <head>
13
+ <title>Ankusa - Naive Bayes classifier with HBase storage</title>
14
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
15
+ </head>
16
+ <frameset rows="20%, 80%">
17
+ <frameset cols="25%,35%,45%">
18
+ <frame src="fr_file_index.html" title="Files" name="Files" />
19
+ <frame src="fr_class_index.html" name="Classes" />
20
+ <frame src="fr_method_index.html" name="Methods" />
21
+ </frameset>
22
+ <frame src="files/README_rdoc.html" name="docwin" />
23
+ </frameset>
24
+ </html>
@@ -0,0 +1,208 @@
1
+
2
+ body {
3
+ font-family: Verdana,Arial,Helvetica,sans-serif;
4
+ font-size: 90%;
5
+ margin: 0;
6
+ margin-left: 40px;
7
+ padding: 0;
8
+ background: white;
9
+ }
10
+
11
+ h1,h2,h3,h4 { margin: 0; color: #efefef; background: transparent; }
12
+ h1 { font-size: 150%; }
13
+ h2,h3,h4 { margin-top: 1em; }
14
+
15
+ a { background: #eef; color: #039; text-decoration: none; }
16
+ a:hover { background: #039; color: #eef; }
17
+
18
+ /* Override the base stylesheet's Anchor inside a table cell */
19
+ td > a {
20
+ background: transparent;
21
+ color: #039;
22
+ text-decoration: none;
23
+ }
24
+
25
+ /* and inside a section title */
26
+ .section-title > a {
27
+ background: transparent;
28
+ color: #eee;
29
+ text-decoration: none;
30
+ }
31
+
32
+ /* === Structural elements =================================== */
33
+
34
+ div#index {
35
+ margin: 0;
36
+ margin-left: -40px;
37
+ padding: 0;
38
+ font-size: 90%;
39
+ }
40
+
41
+
42
+ div#index a {
43
+ margin-left: 0.7em;
44
+ }
45
+
46
+ div#index .section-bar {
47
+ margin-left: 0px;
48
+ padding-left: 0.7em;
49
+ background: #ccc;
50
+ font-size: small;
51
+ }
52
+
53
+
54
+ div#classHeader, div#fileHeader {
55
+ width: auto;
56
+ color: white;
57
+ padding: 0.5em 1.5em 0.5em 1.5em;
58
+ margin: 0;
59
+ margin-left: -40px;
60
+ border-bottom: 3px solid #006;
61
+ }
62
+
63
+ div#classHeader a, div#fileHeader a {
64
+ background: inherit;
65
+ color: white;
66
+ }
67
+
68
+ div#classHeader td, div#fileHeader td {
69
+ background: inherit;
70
+ color: white;
71
+ }
72
+
73
+
74
+ div#fileHeader {
75
+ background: #057;
76
+ }
77
+
78
+ div#classHeader {
79
+ background: #048;
80
+ }
81
+
82
+
83
+ .class-name-in-header {
84
+ font-size: 180%;
85
+ font-weight: bold;
86
+ }
87
+
88
+
89
+ div#bodyContent {
90
+ padding: 0 1.5em 0 1.5em;
91
+ }
92
+
93
+ div#description {
94
+ padding: 0.5em 1.5em;
95
+ background: #efefef;
96
+ border: 1px dotted #999;
97
+ }
98
+
99
+ div#description h1,h2,h3,h4,h5,h6 {
100
+ color: #125;
101
+ background: transparent;
102
+ }
103
+
104
+ div#validator-badges {
105
+ text-align: center;
106
+ }
107
+ div#validator-badges img { border: 0; }
108
+
109
+ div#copyright {
110
+ color: #333;
111
+ background: #efefef;
112
+ font: 0.75em sans-serif;
113
+ margin-top: 5em;
114
+ margin-bottom: 0;
115
+ padding: 0.5em 2em;
116
+ }
117
+
118
+
119
+ /* === Classes =================================== */
120
+
121
+ table.header-table {
122
+ color: white;
123
+ font-size: small;
124
+ }
125
+
126
+ .type-note {
127
+ font-size: small;
128
+ color: #DEDEDE;
129
+ }
130
+
131
+ .xxsection-bar {
132
+ background: #eee;
133
+ color: #333;
134
+ padding: 3px;
135
+ }
136
+
137
+ .section-bar {
138
+ color: #333;
139
+ border-bottom: 1px solid #999;
140
+ margin-left: -20px;
141
+ }
142
+
143
+
144
+ .section-title {
145
+ background: #79a;
146
+ color: #eee;
147
+ padding: 3px;
148
+ margin-top: 2em;
149
+ margin-left: -30px;
150
+ border: 1px solid #999;
151
+ }
152
+
153
+ .top-aligned-row { vertical-align: top }
154
+ .bottom-aligned-row { vertical-align: bottom }
155
+
156
+ /* --- Context section classes ----------------------- */
157
+
158
+ .context-row { }
159
+ .context-item-name { font-family: monospace; font-weight: bold; color: black; }
160
+ .context-item-value { font-size: small; color: #448; }
161
+ .context-item-desc { color: #333; padding-left: 2em; }
162
+
163
+ /* --- Method classes -------------------------- */
164
+ .method-detail {
165
+ background: #efefef;
166
+ padding: 0;
167
+ margin-top: 0.5em;
168
+ margin-bottom: 1em;
169
+ border: 1px dotted #ccc;
170
+ }
171
+ .method-heading {
172
+ color: black;
173
+ background: #ccc;
174
+ border-bottom: 1px solid #666;
175
+ padding: 0.2em 0.5em 0 0.5em;
176
+ }
177
+ .method-signature { color: black; background: inherit; }
178
+ .method-name { font-weight: bold; }
179
+ .method-args { font-style: italic; }
180
+ .method-description { padding: 0 0.5em 0 0.5em; }
181
+
182
+ /* --- Source code sections -------------------- */
183
+
184
+ a.source-toggle { font-size: 90%; }
185
+ div.method-source-code {
186
+ background: #262626;
187
+ color: #ffdead;
188
+ margin: 1em;
189
+ padding: 0.5em;
190
+ border: 1px dashed #999;
191
+ overflow: hidden;
192
+ }
193
+
194
+ div.method-source-code pre { color: #ffdead; overflow: hidden; }
195
+
196
+ /* --- Ruby keyword styles --------------------- */
197
+
198
+ .standalone-code { background: #221111; color: #ffdead; overflow: hidden; }
199
+
200
+ .ruby-constant { color: #7fffd4; background: transparent; }
201
+ .ruby-keyword { color: #00ffff; background: transparent; }
202
+ .ruby-ivar { color: #eedd82; background: transparent; }
203
+ .ruby-operator { color: #00ffee; background: transparent; }
204
+ .ruby-identifier { color: #ffdead; background: transparent; }
205
+ .ruby-node { color: #ffa07a; background: transparent; }
206
+ .ruby-comment { color: #b22222; font-weight: bold; background: transparent; }
207
+ .ruby-regexp { color: #ffa07a; background: transparent; }
208
+ .ruby-value { color: #7fffd4; background: transparent; }
data/lib/ankusa.rb ADDED
@@ -0,0 +1,3 @@
1
+ require 'ankusa/classifier'
2
+ require 'ankusa/hasher'
3
+ require 'ankusa/nbclass'
@@ -0,0 +1,125 @@
1
module Ankusa
  # Probability mass given to a word that has no positive count for a class.
  # get_word_probs divides it by that class's total word count.
  SMALL_PROB = 0.0001

  # Naive Bayes text classifier whose counts live in two tables reached
  # through an HBase client:
  #
  # * the frequency table: one row per word, one "classes:<name>" column per
  #   class holding how often the word was seen for that class;
  # * the summary table: one row per class with "totals:wordcount" and
  #   "totals:doccount" columns.
  #
  # TextHash and NBClass are defined in sibling files of this library.
  class Classifier
    # Class names known to this instance, as Symbols.
    attr_reader :classnames

    # hbase_client must respond to has_table?, create_table and get_table.
    # Missing tables are created, then the known class names are loaded
    # from the summary table.
    def initialize(hbase_client, frequency_tablename="ankusa_word_frequencies", summary_tablename="ankusa_summary")
      @hbase = hbase_client
      @ftablename = frequency_tablename
      @stablename = summary_tablename
      init_tables
      refresh_classnames
    end

    # Record +text+ as one more document of class +klass+: bump every word's
    # per-class counter, then the class's word and document totals.
    def train(klass, text)
      th = TextHash.new(text)
      th.each do |word, count|
        freq_table.atomic_increment word, "classes:#{klass}", count
      end
      summary_table.atomic_increment klass, "totals:wordcount", th.word_count
      summary_table.atomic_increment klass, "totals:doccount"

      # Class names read back from the tables are Symbols, so store the
      # Symbol form here too; otherwise train("spam", ...) and
      # train(:spam, ...) would register two distinct classes.
      name = klass.to_s.intern
      @classnames << name unless @classnames.include?(name)
    end

    # Reverse a previous train(klass, text) by decrementing the same
    # counters. The class itself stays in classnames.
    def untrain(klass, text)
      th = TextHash.new(text)
      th.each do |word, count|
        freq_table.atomic_increment word, "classes:#{klass}", -count
      end
      summary_table.atomic_increment klass, "totals:wordcount", -th.word_count
      summary_table.atomic_increment klass, "totals:doccount", -1
    end

    # Return the name of the most probable class for +text+, or nil when no
    # class has been trained yet.
    def classify(text)
      # Pick the HIGHEST probability (an ascending sort followed by .first
      # would select the least likely class).
      best = classifications(text).max_by { |_name, prob| prob }
      best && best.first
    end

    # Return a Hash mapping every known class name to its normalised
    # probability for +text+ (the values sum to 1).
    def classifications(text)
      classes = {}
      scores = {}
      @classnames.each do |k|
        classes[k] = NBClass.new(k, summary_table, freq_table)
        scores[k] = 0.0
      end

      # Log-likelihood: a word occurring +count+ times contributes its log
      # probability +count+ times.
      TextHash.new(text).each do |word, count|
        probs = get_word_probs(word, classes)
        @classnames.each { |k| scores[k] += count * Math.log(probs[k]) }
      end

      # Log-prior. The total is loop invariant (it scans the summary table),
      # and to_f guards against integer division truncating the prior to 0.
      total_docs = doc_count_total.to_f
      @classnames.each { |k| scores[k] += Math.log(classes[k].doc_count / total_docs) }

      # Shift by the largest finite score before exponentiating so long
      # texts do not underflow every class to 0.0 (which would normalise to
      # NaN). The shift cancels out in the division below.
      shift = scores.values.select { |s| s.finite? }.max || 0.0
      result = {}
      scores.each { |k, s| result[k] = Math.exp(s - shift) }

      sum = result.values.inject(0.0) { |acc, v| acc + v }
      result.keys.each { |k| result[k] = result[k] / sum }
      result
    end

    # Reload the list of classes from the summary table (one row per class),
    # store it in classnames and return it.
    def refresh_classnames
      cs = []
      summary_table.create_scanner("", "totals") { |row| cs << row.row.intern }
      @classnames = cs
    end

    # Delete both tables and forget the cached table handles.
    def drop_tables
      freq_table.delete
      summary_table.delete
      @stable = nil
      @ftable = nil
    end

    # Drop and recreate both tables, then reload classnames so no stale
    # class from before the reset is used by classifications.
    def reset
      drop_tables
      init_tables
      refresh_classnames
    end

    # Total number of trained documents: the sum of "totals:doccount" over
    # every row of the summary table.
    def doc_count_total
      total = 0
      summary_table.create_scanner("", "totals:doccount") do |row|
        total += row.columns["totals:doccount"].to_i64
      end
      total
    end

    protected

    # Return a Hash of class name => P(word | class) for every known class.
    # +classes+ maps class names to NBClass instances. Classes without a
    # positive count for +word+ get SMALL_PROB / word_count.
    def get_word_probs(word, classes)
      probs = {}
      @classnames.each { |cn| probs[cn] = Ankusa::SMALL_PROB / classes[cn].word_count }
      row = freq_table.get_row(word)
      return probs if row.length == 0

      row.first.columns.each do |colname, cell|
        family, qualifier = colname.split(':', 2)
        # Only "classes:<name>" columns carry per-class word counts.
        next unless family == "classes" && qualifier

        classname = qualifier.intern
        # The row may mention a class this instance does not know about.
        next unless classes.key?(classname)

        # Zero or negative counts (possible after untrain) keep the
        # SMALL_PROB fallback, so Math.log never sees a value <= 0.
        count = cell.to_i64
        next unless count > 0

        probs[classname] = count.to_f / classes[classname].word_count
      end
      probs
    end

    # Create the frequency and summary tables if they do not exist yet.
    def init_tables
      unless @hbase.has_table?(@ftablename)
        @hbase.create_table @ftablename, "classes", "total"
      end

      unless @hbase.has_table?(@stablename)
        @hbase.create_table @stablename, "totals"
      end
    end

    # Memoised handle on the summary table.
    def summary_table
      @stable ||= @hbase.get_table(@stablename)
    end

    # Memoised handle on the word-frequency table.
    def freq_table
      @ftable ||= @hbase.get_table(@ftablename)
    end
  end

end